restet 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +30 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +30 -0
- data/README +6 -0
- data/Rakefile +10 -0
- data/bin/restet-cli +25 -0
- data/bin/restet-web +15 -0
- data/lib/restet.rb +8 -0
- data/lib/restet/client.rb +25 -0
- data/lib/restet/client/extraction_params.rb +19 -0
- data/lib/restet/server.rb +21 -0
- data/lib/restet/server/command.rb +20 -0
- data/lib/restet/version.rb +3 -0
- data/restet.gemspec +25 -0
- data/test/client_test.rb +6 -0
- data/test/test_helper.rb +12 -0
- metadata +110 -0
data/.gitignore
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
coverage
|
6
|
+
InstalledFiles
|
7
|
+
lib/bundler/man
|
8
|
+
pkg
|
9
|
+
rdoc
|
10
|
+
spec/reports
|
11
|
+
test/tmp
|
12
|
+
test/version_tmp
|
13
|
+
tmp
|
14
|
+
|
15
|
+
# YARD artifacts
|
16
|
+
.yardoc
|
17
|
+
_yardoc
|
18
|
+
doc/
|
19
|
+
|
20
|
+
# OSX
|
21
|
+
.DS_Store
|
22
|
+
|
23
|
+
# Thumbnails
|
24
|
+
._*
|
25
|
+
|
26
|
+
# Files that might appear on external disk
|
27
|
+
.Spotlight-V100
|
28
|
+
.Trashes
|
29
|
+
|
30
|
+
.redcar
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
restet (0.0.1)
|
5
|
+
multipart-post
|
6
|
+
sinatra
|
7
|
+
vegas
|
8
|
+
yajl-ruby
|
9
|
+
|
10
|
+
GEM
|
11
|
+
remote: http://rubygems.org/
|
12
|
+
specs:
|
13
|
+
multipart-post (1.1.4)
|
14
|
+
rack (1.3.5)
|
15
|
+
rack-protection (1.1.4)
|
16
|
+
rack
|
17
|
+
sinatra (1.3.1)
|
18
|
+
rack (~> 1.3, >= 1.3.4)
|
19
|
+
rack-protection (~> 1.1, >= 1.1.2)
|
20
|
+
tilt (~> 1.3, >= 1.3.3)
|
21
|
+
tilt (1.3.3)
|
22
|
+
vegas (0.1.8)
|
23
|
+
rack (>= 1.0.0)
|
24
|
+
yajl-ruby (1.1.0)
|
25
|
+
|
26
|
+
PLATFORMS
|
27
|
+
ruby
|
28
|
+
|
29
|
+
DEPENDENCIES
|
30
|
+
restet!
|
data/README
ADDED
@@ -0,0 +1,6 @@
|
|
1
|
+
This gem currently provides a very basic way to expose PDFlib TET command-line (CLI) functions.
|
2
|
+
|
3
|
+
Right now, I only need a small subset of features, but as I need more I will implement additional
|
4
|
+
extraction parameters and resources.
|
5
|
+
|
6
|
+
TET can be found here: http://www.pdflib.com/products/tet/
|
data/Rakefile
ADDED
data/bin/restet-cli
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
|
4
|
+
|
5
|
+
require 'restet'
|
6
|
+
require 'restet/client'
|
7
|
+
require 'uri'
|
8
|
+
|
9
|
+
filename = ARGV[0]
|
10
|
+
|
11
|
+
# Resolves the URI of the extraction endpoint the CLI should talk to.
#
# Lookup order:
# 1. the RESTET_URI environment variable, when it is set;
# 2. the whitespace-stripped contents of ~/.restet, when that file exists;
# 3. the built-in default, http://localhost:5679/extractions.
#
# @return [String] the endpoint URI as a string
def get_restet_uri_string
  return ENV['RESTET_URI'] if ENV['RESTET_URI']

  # File.exist? and File.read do not expand "~" (that is a shell feature), so
  # the path must be expanded explicitly or the config file is never found.
  config_path = begin
    File.expand_path('~/.restet')
  rescue ArgumentError
    nil # home directory cannot be determined; skip the config file
  end
  return File.read(config_path).strip if config_path && File.exist?(config_path)

  'http://localhost:5679/extractions'
end
|
16
|
+
|
17
|
+
# Bail out with a usage hint when no readable input file was given.
# File.exist?(nil) raises TypeError, so the missing-argument case is checked
# first. NOTE(review): the x-then-y order in the usage text is assumed from the
# bottom_left/top_right naming — confirm against TET's includebox option.
if filename.nil? || !File.exist?(filename)
  abort 'usage: restet-cli PDF_FILE BOTTOM_LEFT_X BOTTOM_LEFT_Y TOP_RIGHT_X TOP_RIGHT_Y'
end

# Binary mode keeps the PDF bytes intact on platforms that translate newlines.
File.open(filename, 'rb') do |file|
  client = Restet::Client.new(URI.parse(get_restet_uri_string))
  # Request a single slice named :slice whose two corners come from the
  # remaining command-line arguments.
  extraction = client.extract(file) do |params|
    params.slice :slice, [ARGV[1], ARGV[2]], [ARGV[3], ARGV[4]]
  end
  puts extraction[:slice]
end
|
data/bin/restet-web
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Put the gem's own lib directory on the load path so the script also works
# from a source checkout.
lib_dir = File.expand_path('../lib', File.dirname(__FILE__))
$LOAD_PATH.unshift(lib_dir)

# vegas may only be reachable once RubyGems is loaded; retry the require
# after loading it.
begin
  require 'vegas'
rescue LoadError
  require 'rubygems'
  require 'vegas'
end

require 'restet'
require 'restet/server'

# Hand the Sinatra app to Vegas under the name 'restet-web'.
Vegas::Runner.new(Restet::Server, 'restet-web')
|
data/lib/restet.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'net/http/post/multipart'
|
3
|
+
require 'restet/client/extraction_params'
|
4
|
+
|
5
|
+
module Restet
  # HTTP client for the extraction resource of a restet server.
  class Client
    # @param uri [URI::HTTP] parsed URI of the extractions endpoint
    def initialize(uri)
      @uri = uri
    end

    # Uploads +file+ together with the requested slices and returns the
    # server's parsed JSON response.
    #
    # @param file [IO] open PDF file to upload
    # @yieldparam params [ExtractionParams] collector on which the caller
    #   declares the slices to extract
    # @return [Object] the parsed JSON response body
    def extract(file)
      params = ExtractionParams.new(file)
      yield params if block_given?
      # Enable TLS for https:// endpoints; plain http behaves as before.
      Net::HTTP.start(@uri.host, @uri.port, :use_ssl => @uri.scheme == 'https') do |http|
        response = http.request(build_request(file, params))
        # NOTE(review): :symbolize_keys is the yajl-ruby option name; the
        # stdlib JSON parser calls it :symbolize_names — confirm which JSON
        # implementation restet.rb loads.
        JSON.parse(response.body, :symbolize_keys => true)
      end
    end

    # Builds the multipart POST carrying the PDF (field :pdf) and the JSON
    # encoded slices (field :extraction).
    #
    # @param file [IO] open PDF file to upload
    # @param params [ExtractionParams] slices to request
    # @return [Net::HTTP::Post::Multipart]
    def build_request(file, params)
      upload_io = UploadIO.new(file, 'application/pdf')
      # request_uri keeps the query string and yields "/" for a bare host,
      # where #path alone would drop the query or produce an empty path.
      target = @uri.respond_to?(:request_uri) ? @uri.request_uri : @uri.path
      Net::HTTP::Post::Multipart.new(target, :pdf => upload_io, :extraction => params.to_json)
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Restet
  class Client
    # Collects the named regions ("slices") to extract from one file and
    # serialises them as the JSON payload of an extraction request.
    class ExtractionParams
      attr_reader :file, :slices

      # @param file [Object] the file the slices refer to
      # @param slices [Hash] initial slices, keyed by name
      def initialize(file, slices = {})
        @file, @slices = file, slices
      end

      # Registers a slice under +name+ (stored as a Symbol key).
      #
      # @param name [#to_sym] slice name
      # @param bottom_left [Array] bottom-left corner of the region
      # @param top_right [Array] top-right corner of the region
      # @return [Hash] the stored slice definition
      def slice(name, bottom_left, top_right)
        @slices[name.to_sym] = {:bottom_left => bottom_left, :top_right => top_right}
      end

      # Serialises the slices to JSON.
      #
      # Accepts and ignores the generator arguments that JSON libraries pass
      # when this object is nested inside another structure; without the
      # splat such a call raises ArgumentError.
      #
      # @return [String] JSON object mapping slice names to their corners
      def to_json(*_args)
        JSON.dump(@slices)
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'sinatra/base'
|
2
|
+
|
3
|
+
module Restet
  # Sinatra application that exposes text extraction over HTTP.
  class Server < Sinatra::Base
    # Keep this require inside the class body: command.rb reopens
    # `class Server` without naming a superclass, so Server must already be
    # defined as a Sinatra::Base subclass when that file is loaded.
    require 'restet/server/command'

    # Reports the gem version as the response body.
    get '/version' do
      Restet::VERSION
    end

    # Runs one extraction per requested slice against the uploaded PDF.
    #
    # Expects a multipart form with a file field `pdf` and a field
    # `extraction` holding a JSON object that maps slice names to slice
    # definitions. Responds with a JSON object mapping each slice name to its
    # extracted text, or with 400 when the request is malformed.
    post '/extractions' do
      upload = params[:pdf]
      halt 400, 'missing pdf upload' unless upload.is_a?(Hash) && upload[:tempfile]
      halt 400, 'missing extraction parameter' unless params[:extraction]

      begin
        slices = JSON.parse(params[:extraction])
      rescue JSON::ParserError
        halt 400, 'extraction parameter is not valid JSON'
      end
      halt 400, 'extraction parameter must be a JSON object' unless slices.is_a?(Hash)

      extractions = {}
      slices.each do |name, slice|
        extractions[name] = Command.new(upload[:tempfile], slice).execute
      end
      JSON.dump(extractions)
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Restet
  class Server
    # Runs the `tet` command-line tool on a file, restricted to one slice
    # (a rectangular region), and returns the extracted text.
    class Command
      # Banner that precedes the extracted text in tet's output. The version
      # and copyright years are matched loosely so other TET releases are
      # stripped as well; literal dots are escaped.
      BANNER = /PDFlib TET: PDFlib Text Extraction Toolkit, [^\n]*\n\(c\) [^\n]*PDFlib GmbH www\.pdflib\.com sales@pdflib\.com\n/

      # A plain decimal number, optionally signed and with an exponent.
      COORDINATE = /\A[-+]?\d+(\.\d+)?([eE][-+]?\d+)?\z/

      # @param file [#to_path, String] the PDF to read
      # @param slice [Hash] slice definition with 'bottom_left' and
      #   'top_right' arrays of coordinates (String keys)
      def initialize(file, slice)
        @file, @slice = file, slice
      end

      # Builds the TET page option limiting extraction to the slice.
      #
      # @return [String] e.g. "includebox={{1 2 3 4}}"
      # @raise [ArgumentError] when a corner is missing or not numeric
      def pageopt
        "includebox={{#{corner('bottom_left')} #{corner('top_right')}}}"
      end

      # Invokes tet and returns its standard output with the banner removed.
      #
      # The command is passed as an argument vector, so no shell is involved:
      # slice values and the file path cannot inject shell syntax, and paths
      # containing spaces work.
      #
      # @return [String] the extracted text
      # @raise [ArgumentError] when the slice is malformed
      def execute
        argv = ['tet', '--text', '--pageopt', pageopt, '--outfile', '-', File.realpath(@file)]
        output = IO.popen(argv) { |io| io.read }
        output.gsub(BANNER, '')
      end

      private

      # Returns the space-joined coordinates stored under +key+, after
      # checking that each one is a plain number. The slice comes from the
      # request body, so it is not trusted.
      def corner(key)
        values = @slice.is_a?(Hash) ? @slice[key] : nil
        valid = values.is_a?(Array) && !values.empty? &&
                values.all? { |value| value.to_s =~ COORDINATE }
        raise ArgumentError, "invalid #{key} coordinates: #{values.inspect}" unless valid
        values.join(' ')
      end
    end
  end
end
|
data/restet.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "restet/version"
|
4
|
+
|
5
|
+
# Gem specification for restet: package metadata, packaged files and runtime
# dependencies.
Gem::Specification.new do |s|
  s.name        = "restet"
  s.version     = Restet::VERSION
  s.authors     = ["Scott Holden"]
  s.email       = ["ssh@sshconnection.com"]
  s.homepage    = "http://github.com/sholden/restet"
  s.summary     = %q{A RESTful API exposing PDFlib TET functionality}
  s.description = %q{Provides a very simple extraction resource for extracting text from slices of a PDF.}

  s.rubyforge_project = "restet"

  # The packaged file lists come from git, so the gem must be built inside a
  # git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_runtime_dependency 'multipart-post'
  s.add_runtime_dependency 'yajl-ruby'
  s.add_runtime_dependency 'sinatra'
  s.add_runtime_dependency 'vegas'
end
|
data/test/client_test.rb
ADDED
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: restet
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Scott Holden
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-12-01 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: multipart-post
|
16
|
+
requirement: &70229104384680 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70229104384680
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: yajl-ruby
|
27
|
+
requirement: &70229104383680 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70229104383680
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: sinatra
|
38
|
+
requirement: &70229104382320 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :runtime
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *70229104382320
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: vegas
|
49
|
+
requirement: &70229104381820 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :runtime
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *70229104381820
|
58
|
+
description: Provides a very simple extraction resource for extracting text from slices
|
59
|
+
of a PDF.
|
60
|
+
email:
|
61
|
+
- ssh@sshconnection.com
|
62
|
+
executables:
|
63
|
+
- restet-cli
|
64
|
+
- restet-web
|
65
|
+
extensions: []
|
66
|
+
extra_rdoc_files: []
|
67
|
+
files:
|
68
|
+
- .gitignore
|
69
|
+
- Gemfile
|
70
|
+
- Gemfile.lock
|
71
|
+
- README
|
72
|
+
- Rakefile
|
73
|
+
- bin/restet-cli
|
74
|
+
- bin/restet-web
|
75
|
+
- lib/restet.rb
|
76
|
+
- lib/restet/client.rb
|
77
|
+
- lib/restet/client/extraction_params.rb
|
78
|
+
- lib/restet/server.rb
|
79
|
+
- lib/restet/server/command.rb
|
80
|
+
- lib/restet/version.rb
|
81
|
+
- restet.gemspec
|
82
|
+
- test/client_test.rb
|
83
|
+
- test/test_helper.rb
|
84
|
+
homepage: http://github.com/sholden/restet
|
85
|
+
licenses: []
|
86
|
+
post_install_message:
|
87
|
+
rdoc_options: []
|
88
|
+
require_paths:
|
89
|
+
- lib
|
90
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
91
|
+
none: false
|
92
|
+
requirements:
|
93
|
+
- - ! '>='
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: '0'
|
96
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
requirements: []
|
103
|
+
rubyforge_project: restet
|
104
|
+
rubygems_version: 1.8.10
|
105
|
+
signing_key:
|
106
|
+
specification_version: 3
|
107
|
+
summary: A RESTful API exposing PDFlib TET functionality
|
108
|
+
test_files:
|
109
|
+
- test/client_test.rb
|
110
|
+
- test/test_helper.rb
|