restet 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,30 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ coverage
6
+ InstalledFiles
7
+ lib/bundler/man
8
+ pkg
9
+ rdoc
10
+ spec/reports
11
+ test/tmp
12
+ test/version_tmp
13
+ tmp
14
+
15
+ # YARD artifacts
16
+ .yardoc
17
+ _yardoc
18
+ doc/
19
+
20
+ # OSX
21
+ .DS_Store
22
+
23
+ # Thumbnails
24
+ ._*
25
+
26
+ # Files that might appear on external disk
27
+ .Spotlight-V100
28
+ .Trashes
29
+
30
+ .redcar
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in restet.gemspec
gemspec
@@ -0,0 +1,30 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ restet (0.0.1)
5
+ multipart-post
6
+ sinatra
7
+ vegas
8
+ yajl-ruby
9
+
10
+ GEM
11
+ remote: http://rubygems.org/
12
+ specs:
13
+ multipart-post (1.1.4)
14
+ rack (1.3.5)
15
+ rack-protection (1.1.4)
16
+ rack
17
+ sinatra (1.3.1)
18
+ rack (~> 1.3, >= 1.3.4)
19
+ rack-protection (~> 1.1, >= 1.1.2)
20
+ tilt (~> 1.3, >= 1.3.3)
21
+ tilt (1.3.3)
22
+ vegas (0.1.8)
23
+ rack (>= 1.0.0)
24
+ yajl-ruby (1.1.0)
25
+
26
+ PLATFORMS
27
+ ruby
28
+
29
+ DEPENDENCIES
30
+ restet!
data/README ADDED
@@ -0,0 +1,6 @@
1
+ This is currently a very basic way to expose the PDFlib TET CLI functions.
2
+
3
+ Right now, I only need a small subset of features, but as I need more I will implement additional
4
+ extraction parameters and resources.
5
+
6
+ TET can be found here: http://www.pdflib.com/products/tet/
@@ -0,0 +1,10 @@
1
require "bundler/gem_tasks"
require 'rake/testtask'

# `rake test` runs every test/**/*_test.rb file with test/ on the load path.
Rake::TestTask.new(:test) do |t|
  t.libs << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end

# Running plain `rake` executes the test task.
task :default => :test
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
4
+
5
+ require 'restet'
6
+ require 'restet/client'
7
+ require 'uri'
8
+
9
+ filename = ARGV[0]
10
+
11
# Resolves the URI string of the extraction endpoint.
#
# Lookup order:
#   1. the RESTET_URI environment variable (ignored when empty),
#   2. the stripped contents of ~/.restet, if that file exists,
#   3. the default 'http://localhost:5679/extractions'.
#
# @return [String] the URI string to hand to URI.parse
def get_restet_uri_string
  from_env = ENV['RESTET_URI']
  return from_env unless from_env.nil? || from_env.empty?

  # File.exist? and File.read take "~" literally, so the home directory has
  # to be expanded explicitly or the dotfile is never found.
  dotfile = File.expand_path('~/.restet')
  return File.read(dotfile).strip if File.exist?(dotfile)

  'http://localhost:5679/extractions'
end
16
+
17
# Exit with a usage hint when the file argument is missing or does not exist.
# Checking for nil first avoids the TypeError File.exist?(nil) would raise.
# The coordinate names follow the bottom-left / top-right pairs built below.
abort 'usage: restet-cli FILE LLX LLY URX URY' unless filename && File.exist?(filename)

# PDFs are binary data, so open the file in binary mode before uploading it.
File.open(filename, 'rb') do |file|
  client = Restet::Client.new(URI.parse(get_restet_uri_string))
  # Request a single slice named :slice whose corners come from the
  # remaining command line arguments.
  extraction = client.extract(file) do |params|
    params.slice :slice, [ARGV[1], ARGV[2]], [ARGV[3], ARGV[4]]
  end
  puts extraction[:slice]
end
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+
3
# Put the sibling lib/ directory at the front of the load path.
$LOAD_PATH.unshift File.expand_path('../lib', File.dirname(__FILE__))

# Try a plain require first; if that fails, load RubyGems and retry.
begin
  require 'vegas'
rescue LoadError
  require 'rubygems'
  require 'vegas'
end

%w[restet restet/server].each { |feature| require feature }

# Hand the server class to Vegas under the app name 'restet-web'.
Vegas::Runner.new(Restet::Server, 'restet-web')
@@ -0,0 +1,8 @@
1
+ require 'yajl'
2
+ require 'yajl/json_gem'
3
+ require 'restet/version'
4
+
5
+
6
# Root namespace of the gem; intentionally empty in this file.
module Restet; end
@@ -0,0 +1,25 @@
1
+ require 'net/http'
2
+ require 'net/http/post/multipart'
3
+ require 'restet/client/extraction_params'
4
+
5
module Restet
  # HTTP client that uploads a PDF to an extractions endpoint and returns
  # the parsed JSON response.
  class Client
    # @param uri [URI] endpoint to post extractions to
    def initialize(uri)
      @uri = uri
    end

    # Uploads +file+ together with the requested slices.
    #
    # @param file [IO] the PDF to upload
    # @yieldparam params [ExtractionParams] collector on which the caller
    #   declares the slices to extract
    # @return [Object] the parsed response body, with symbolized keys
    def extract(file)
      params = ExtractionParams.new(file)
      yield params if block_given?
      # Enable TLS for https:// endpoints; without it the request would be
      # sent in plain text to the TLS port.
      Net::HTTP.start(@uri.host, @uri.port, :use_ssl => @uri.scheme == 'https') do |http|
        response = http.request(build_request(file, params))
        JSON.parse(response.body, :symbolize_keys => true)
      end
    end

    # Builds the multipart POST carrying the PDF and the JSON-encoded slices.
    #
    # @param file [IO] the PDF to upload
    # @param params [ExtractionParams] slices to serialize
    # @return [Net::HTTP::Post::Multipart]
    def build_request(file, params)
      upload_io = UploadIO.new(file, 'application/pdf')
      Net::HTTP::Post::Multipart.new(request_target, :pdf => upload_io, :extraction => params.to_json)
    end

    private

    # Request target for the endpoint. HTTP URIs use request_uri, which
    # keeps the query string and yields "/" for an empty path.
    def request_target
      @uri.respond_to?(:request_uri) ? @uri.request_uri : @uri.path
    end
  end
end
@@ -0,0 +1,19 @@
1
module Restet
  class Client
    # Collects the named rectangular slices to extract from one file.
    class ExtractionParams
      attr_reader :file, :slices

      # @param file [Object] the file the slices refer to
      # @param slices [Hash] initial slices keyed by name
      def initialize(file, slices = {})
        @file, @slices = file, slices
      end

      # Registers (or replaces) a slice.
      #
      # @param name [#to_sym] slice name, used as the key in the response
      # @param bottom_left [Array] coordinates of the bottom-left corner
      # @param top_right [Array] coordinates of the top-right corner
      # @return [Hash] the stored slice description
      def slice(name, bottom_left, top_right)
        @slices[name.to_sym] = {:bottom_left => bottom_left, :top_right => top_right}
      end

      # Serializes the slices (not the file) as JSON.
      #
      # Generator arguments are accepted and ignored so that JSON libraries
      # calling to_json(state) do not raise ArgumentError.
      #
      # @return [String]
      def to_json(*_args)
        JSON.dump(@slices)
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'sinatra/base'
2
+
3
module Restet
  # Sinatra application exposing the gem version and a PDF extraction
  # resource.
  class Server < Sinatra::Base
    require 'restet/server/command'

    # Returns the gem version as the response body.
    get '/version' do
      Restet::VERSION
    end

    # Expects a multipart form with a "pdf" file upload and an "extraction"
    # field holding a JSON object of slice name => slice description.
    # Responds with a JSON object of slice name => Command#execute result.
    post '/extractions' do
      upload = params[:pdf]
      # A file upload arrives as a hash carrying :tempfile; anything else
      # is a client error rather than a server crash.
      halt 400, 'missing pdf upload' unless upload.is_a?(Hash) && upload[:tempfile]
      halt 400, 'missing extraction parameter' if params[:extraction].to_s.empty?

      begin
        slices = JSON.parse(params[:extraction])
      rescue JSON::ParserError
        halt 400, 'extraction must be valid JSON'
      end
      halt 400, 'extraction must be a JSON object' unless slices.is_a?(Hash)

      extractions = {}
      slices.each do |name, slice|
        extractions[name] = Command.new(upload[:tempfile], slice).execute
      end
      JSON.dump(extractions)
    end
  end
end
@@ -0,0 +1,20 @@
1
module Restet
  class Server
    # Runs the `tet` command line tool against one rectangular slice of a
    # file and returns what the tool wrote to standard output.
    class Command
      # A coordinate must be a plain decimal number. Slices come from request
      # data, so anything else is rejected before it reaches the tet option
      # list.
      COORDINATE = /\A[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?\z/

      # Banner printed by tet ahead of its output; version and copyright
      # years are matched loosely so other TET releases are stripped too.
      BANNER = /PDFlib TET: PDFlib Text Extraction Toolkit, \S+\n\(c\) \d{4}-\d{4} PDFlib GmbH www\.pdflib\.com sales@pdflib\.com\n/

      # @param file [#to_path, String] the PDF to read
      # @param slice [Hash] description with 'bottom_left' and 'top_right'
      #   keys, each holding a pair of coordinates
      def initialize(file, slice)
        @file, @slice = file, slice
      end

      # Builds the value passed to --pageopt.
      #
      # @return [String] e.g. "includebox={{10 20 300 400}}"
      # @raise [ArgumentError] if a corner is missing or not numeric
      def pageopt
        "includebox={{#{corner('bottom_left')} #{corner('top_right')}}}"
      end

      # Runs tet and returns its output with the banner removed.
      #
      # The command is passed as an argument array, so no shell is involved:
      # neither the page options nor the file path can inject shell syntax,
      # and paths containing spaces work.
      #
      # @return [String]
      def execute
        argv = ['tet', '--text', '--pageopt', pageopt, '--outfile', '-', File.realpath(@file)]
        output = IO.popen(argv) { |io| io.read }
        output.gsub(BANNER, '')
      end

      private

      # Validates one corner of the slice and renders it as "a b".
      def corner(key)
        values = @slice[key]
        unless values.is_a?(Array) && values.size == 2 && values.all? { |v| v.to_s =~ COORDINATE }
          raise ArgumentError, "#{key} must be a pair of numbers"
        end
        values.join(' ')
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Restet
  # Gem version string; frozen so the constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,25 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "restet/version"
4
+
5
# Gem specification for restet.
Gem::Specification.new do |s|
  s.name        = "restet"
  s.version     = Restet::VERSION
  s.authors     = ["Scott Holden"]
  s.email       = ["ssh@sshconnection.com"]
  s.homepage    = "http://github.com/sholden/restet"
  s.summary     = %q{A RESTful API exposing PDFlib TET functionality}
  s.description = %q{Provides a very simple extraction resource for extracting text from slices of a PDF.}

  s.rubyforge_project = "restet"

  # File lists are taken from what git tracks.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_runtime_dependency 'multipart-post'
  s.add_runtime_dependency 'yajl-ruby'
  s.add_runtime_dependency 'sinatra'
  s.add_runtime_dependency 'vegas'
end
@@ -0,0 +1,6 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'test_helper'))
2
+ require 'restet/client'
3
+
4
# Test suite for Restet::Client; no test cases are defined yet.
class ClientTest < Restet::TestCase; end
@@ -0,0 +1,12 @@
1
+ dir = File.dirname(File.expand_path(__FILE__))
2
+ $LOAD_PATH.unshift dir + '/../lib'
3
+ $TESTING = true
4
+
5
+ require 'rubygems'
6
+ require 'test/unit'
7
+ require 'restet'
8
+
9
module Restet
  # Base class the gem's test suites inherit from.
  class TestCase < Test::Unit::TestCase; end
end
metadata ADDED
@@ -0,0 +1,110 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: restet
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Scott Holden
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-12-01 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: multipart-post
16
+ requirement: &70229104384680 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70229104384680
25
+ - !ruby/object:Gem::Dependency
26
+ name: yajl-ruby
27
+ requirement: &70229104383680 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *70229104383680
36
+ - !ruby/object:Gem::Dependency
37
+ name: sinatra
38
+ requirement: &70229104382320 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *70229104382320
47
+ - !ruby/object:Gem::Dependency
48
+ name: vegas
49
+ requirement: &70229104381820 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *70229104381820
58
+ description: Provides a very simple extraction resource for extracting text from slices
59
+ of a PDF.
60
+ email:
61
+ - ssh@sshconnection.com
62
+ executables:
63
+ - restet-cli
64
+ - restet-web
65
+ extensions: []
66
+ extra_rdoc_files: []
67
+ files:
68
+ - .gitignore
69
+ - Gemfile
70
+ - Gemfile.lock
71
+ - README
72
+ - Rakefile
73
+ - bin/restet-cli
74
+ - bin/restet-web
75
+ - lib/restet.rb
76
+ - lib/restet/client.rb
77
+ - lib/restet/client/extraction_params.rb
78
+ - lib/restet/server.rb
79
+ - lib/restet/server/command.rb
80
+ - lib/restet/version.rb
81
+ - restet.gemspec
82
+ - test/client_test.rb
83
+ - test/test_helper.rb
84
+ homepage: http://github.com/sholden/restet
85
+ licenses: []
86
+ post_install_message:
87
+ rdoc_options: []
88
+ require_paths:
89
+ - lib
90
+ required_ruby_version: !ruby/object:Gem::Requirement
91
+ none: false
92
+ requirements:
93
+ - - ! '>='
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ required_rubygems_version: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ requirements: []
103
+ rubyforge_project: restet
104
+ rubygems_version: 1.8.10
105
+ signing_key:
106
+ specification_version: 3
107
+ summary: A RESTful API exposing PDFlib TET functionality
108
+ test_files:
109
+ - test/client_test.rb
110
+ - test/test_helper.rb