restet 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +30 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +30 -0
- data/README +6 -0
- data/Rakefile +10 -0
- data/bin/restet-cli +25 -0
- data/bin/restet-web +15 -0
- data/lib/restet.rb +8 -0
- data/lib/restet/client.rb +25 -0
- data/lib/restet/client/extraction_params.rb +19 -0
- data/lib/restet/server.rb +21 -0
- data/lib/restet/server/command.rb +20 -0
- data/lib/restet/version.rb +3 -0
- data/restet.gemspec +25 -0
- data/test/client_test.rb +6 -0
- data/test/test_helper.rb +12 -0
- metadata +110 -0
data/.gitignore
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
coverage
|
6
|
+
InstalledFiles
|
7
|
+
lib/bundler/man
|
8
|
+
pkg
|
9
|
+
rdoc
|
10
|
+
spec/reports
|
11
|
+
test/tmp
|
12
|
+
test/version_tmp
|
13
|
+
tmp
|
14
|
+
|
15
|
+
# YARD artifacts
|
16
|
+
.yardoc
|
17
|
+
_yardoc
|
18
|
+
doc/
|
19
|
+
|
20
|
+
# OSX
|
21
|
+
.DS_Store
|
22
|
+
|
23
|
+
# Thumbnails
|
24
|
+
._*
|
25
|
+
|
26
|
+
# Files that might appear on external disk
|
27
|
+
.Spotlight-V100
|
28
|
+
.Trashes
|
29
|
+
|
30
|
+
.redcar
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
restet (0.0.1)
|
5
|
+
multipart-post
|
6
|
+
sinatra
|
7
|
+
vegas
|
8
|
+
yajl-ruby
|
9
|
+
|
10
|
+
GEM
|
11
|
+
remote: http://rubygems.org/
|
12
|
+
specs:
|
13
|
+
multipart-post (1.1.4)
|
14
|
+
rack (1.3.5)
|
15
|
+
rack-protection (1.1.4)
|
16
|
+
rack
|
17
|
+
sinatra (1.3.1)
|
18
|
+
rack (~> 1.3, >= 1.3.4)
|
19
|
+
rack-protection (~> 1.1, >= 1.1.2)
|
20
|
+
tilt (~> 1.3, >= 1.3.3)
|
21
|
+
tilt (1.3.3)
|
22
|
+
vegas (0.1.8)
|
23
|
+
rack (>= 1.0.0)
|
24
|
+
yajl-ruby (1.1.0)
|
25
|
+
|
26
|
+
PLATFORMS
|
27
|
+
ruby
|
28
|
+
|
29
|
+
DEPENDENCIES
|
30
|
+
restet!
|
data/README
ADDED
@@ -0,0 +1,6 @@
|
|
1
|
+
This is currently a very basic way to expose PDFlib TET CLI functions.
|
2
|
+
|
3
|
+
Right now, I only need a small subset of features, but as I need more I will implement additional
|
4
|
+
extraction parameters and resources.
|
5
|
+
|
6
|
+
TET can be found here: http://www.pdflib.com/products/tet/
|
data/Rakefile
ADDED
data/bin/restet-cli
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
|
4
|
+
|
5
|
+
require 'restet'
|
6
|
+
require 'restet/client'
|
7
|
+
require 'uri'
|
8
|
+
|
9
|
+
filename = ARGV[0]
|
10
|
+
|
11
|
+
# Resolves the URI of the restet extraction endpoint as a string.
#
# Lookup order:
#   1. the RESTET_URI environment variable, when set;
#   2. the stripped contents of ~/.restet, when that file exists;
#   3. the default local endpoint.
#
# File.exist? and File.read do not expand "~" themselves, so the path is run
# through File.expand_path first; without that the config file in the user's
# home directory is never found.
#
# @return [String] the endpoint URI
def get_restet_uri_string
  return ENV['RESTET_URI'] if ENV['RESTET_URI']

  config_path = begin
    File.expand_path('~/.restet')
  rescue ArgumentError
    nil # home directory cannot be determined; fall through to the default
  end
  return File.read(config_path).strip if config_path && File.exist?(config_path)

  'http://localhost:5679/extractions'
end
|
16
|
+
|
17
|
+
# Refuse to continue without a readable input file. File.exist?(nil) raises
# TypeError, so a missing argument is rejected before touching the filesystem,
# and both failures now say why the program stopped.
abort "usage: #{File.basename($0)} FILE <bottom-left pair> <top-right pair>" unless filename
abort "#{filename}: no such file" unless File.exist?(filename)

# Upload the file and print the text returned for the single slice named
# :slice. ARGV[1] and ARGV[2] form the bottom-left pair, ARGV[3] and ARGV[4]
# the top-right pair.
File.open(filename) do |file|
  client = Restet::Client.new(URI.parse(get_restet_uri_string))
  extraction = client.extract(file) do |params|
    params.slice :slice, [ARGV[1], ARGV[2]], [ARGV[3], ARGV[4]]
  end
  puts extraction[:slice]
end
|
data/bin/restet-web
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
|
4
|
+
begin
|
5
|
+
require 'vegas'
|
6
|
+
rescue LoadError
|
7
|
+
require 'rubygems'
|
8
|
+
require 'vegas'
|
9
|
+
end
|
10
|
+
|
11
|
+
require 'restet'
|
12
|
+
require 'restet/server'
|
13
|
+
|
14
|
+
|
15
|
+
Vegas::Runner.new(Restet::Server, 'restet-web')
|
data/lib/restet.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'net/http/post/multipart'
|
3
|
+
require 'restet/client/extraction_params'
|
4
|
+
|
5
|
+
module Restet
  # HTTP client that uploads a PDF to a restet server and returns the parsed
  # extraction result.
  class Client
    # @param uri [URI] endpoint of the extractions resource; its scheme, host,
    #   port and path are read when a request is made
    def initialize(uri)
      @uri = uri
    end

    # Posts +file+ to the server and returns the decoded JSON response.
    #
    # When a block is given it receives the ExtractionParams for this request
    # so the caller can declare slices before anything is sent.
    #
    # @param file [IO] open PDF to upload
    # @return [Object] whatever JSON.parse produces for the response body
    def extract(file)
      params = ExtractionParams.new(file)
      yield params if block_given?

      # Honour the URI scheme: without :use_ssl an https endpoint would be
      # addressed in plain HTTP and the request would fail.
      Net::HTTP.start(@uri.host, @uri.port, :use_ssl => @uri.scheme == 'https') do |http|
        response = http.request(build_request(file, params))
        # NOTE(review): :symbolize_keys is the yajl-ruby option name; the
        # stdlib JSON library calls it :symbolize_names. Confirm which JSON
        # implementation is loaded by the rest of the gem.
        JSON.parse(response.body, :symbolize_keys => true)
      end
    end

    # Builds the multipart POST carrying the PDF (field "pdf") and the slice
    # definitions serialized as JSON (field "extraction").
    #
    # @param file [IO] open PDF to upload
    # @param params [ExtractionParams] slices requested for this upload
    # @return [Net::HTTP::Post::Multipart]
    def build_request(file, params)
      upload_io = UploadIO.new(file, 'application/pdf')
      Net::HTTP::Post::Multipart.new(@uri.path, :pdf => upload_io, :extraction => params.to_json)
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Restet
  class Client
    # Collects the named slices to extract from one file and serializes them
    # for the request.
    class ExtractionParams
      attr_reader :file, :slices

      # @param file [Object] the file the slices refer to; stored, not read here
      # @param slices [Hash] initial slice definitions keyed by name
      def initialize(file, slices = {})
        @file, @slices = file, slices
      end

      # Registers (or replaces) the slice called +name+.
      #
      # @param name [#to_sym] slice name, stored as a Symbol key
      # @param bottom_left [Object] value stored under :bottom_left
      # @param top_right [Object] value stored under :top_right
      # @return [Hash] the stored slice definition
      def slice(name, bottom_left, top_right)
        @slices[name.to_sym] = {:bottom_left => bottom_left, :top_right => top_right}
      end

      # Serializes the slices as a JSON object.
      #
      # JSON generators call to_json with a state argument when an object is
      # nested inside another structure, so extra arguments are accepted and
      # ignored instead of raising ArgumentError.
      #
      # @return [String]
      def to_json(*_args)
        JSON.dump(@slices)
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'sinatra/base'
|
2
|
+
|
3
|
+
module Restet
  # Sinatra application exposing text extraction over HTTP.
  class Server < Sinatra::Base
    require 'restet/server/command'

    # Responds with the gem version string.
    get '/version' do
      Restet::VERSION
    end

    # Runs one Command per named slice against the uploaded file and responds
    # with a JSON object mapping each slice name to its extracted text.
    #
    # Expects two multipart fields: "extraction" (JSON object of slices) and
    # "pdf" (file upload).
    post '/extractions' do
      upload = params[:pdf]
      # Reject incomplete requests up front; otherwise JSON.parse(nil) or
      # nil[:tempfile] would surface as an opaque 500.
      unless params[:extraction] && upload.is_a?(Hash) && upload[:tempfile]
        halt 400, 'expected multipart fields "extraction" (JSON) and "pdf" (file upload)'
      end

      slices = JSON.parse(params[:extraction])
      extractions = {}
      slices.each do |name, slice|
        extractions[name] = Command.new(upload[:tempfile], slice).execute
      end

      content_type :json
      JSON.dump(extractions)
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Restet
  class Server
    # Runs the `tet` command-line tool against one file for one slice and
    # returns the text it prints.
    class Command
      # Text removed from the tool's output. It looks like the start-up banner
      # of tet 4.0p2; other versions would presumably not match (TODO confirm).
      BANNER = /PDFlib TET: PDFlib Text Extraction Toolkit, 4.0p2\n\(c\) 2002-2010 PDFlib GmbH www.pdflib.com sales@pdflib.com\n/

      # Accepted textual form of a single coordinate value.
      COORDINATE = /\A-?\d+(\.\d+)?\z/

      # @param file [String, #to_path] file handed to tet
      # @param slice [Hash] slice definition with 'bottom_left' and
      #   'top_right' entries, each a list of numbers
      def initialize(file, slice)
        @file, @slice = file, slice
      end

      # Builds the value passed to tet's --pageopt option.
      #
      # @return [String]
      # @raise [ArgumentError] when a corner is missing or not numeric
      def pageopt
        "includebox={{#{corner('bottom_left')} #{corner('top_right')}}}"
      end

      # Argument vector for the tet invocation. Kept as an array so that no
      # shell ever parses request-supplied values or the file path.
      #
      # @return [Array<String>]
      def arguments
        ['tet', '--text', '--pageopt', pageopt, '--outfile', '-', File.realpath(@file)]
      end

      # Runs tet and returns its standard output with the banner removed.
      #
      # @return [String]
      def execute
        result = IO.popen(arguments, &:read)
        result.gsub(BANNER, '')
      end

      private

      # Returns the space-separated coordinates stored under +key+, refusing
      # anything that is not a list of plain numbers. The slice comes straight
      # from the HTTP request, so it must not reach the tool unchecked.
      def corner(key)
        values = @slice.is_a?(Hash) ? @slice[key] : nil
        unless values.is_a?(Array) && values.all? { |value| value.to_s =~ COORDINATE }
          raise ArgumentError, "slice #{key} must be a list of numbers"
        end
        values.join(' ')
      end
    end
  end
end
|
data/restet.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "restet/version"
|
4
|
+
|
5
|
+
# Gem specification for restet. File lists are taken from `git ls-files`, so
# the gem must be built from inside a git checkout.
Gem::Specification.new do |s|
  s.name        = "restet"
  s.version     = Restet::VERSION
  s.authors     = ["Scott Holden"]
  s.email       = ["ssh@sshconnection.com"]
  s.homepage    = "http://github.com/sholden/restet"
  s.summary     = %q{A RESTful API exposing PDFlib TET functionality}
  s.description = %q{Provides a very simple extraction resource for extracting text from slices of a PDF.}

  s.rubyforge_project = "restet"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  # Executables are listed by basename.
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_runtime_dependency 'multipart-post'
  s.add_runtime_dependency 'yajl-ruby'
  s.add_runtime_dependency 'sinatra'
  s.add_runtime_dependency 'vegas'
end
|
data/test/client_test.rb
ADDED
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: restet
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Scott Holden
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-12-01 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: multipart-post
|
16
|
+
requirement: &70229104384680 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70229104384680
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: yajl-ruby
|
27
|
+
requirement: &70229104383680 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70229104383680
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: sinatra
|
38
|
+
requirement: &70229104382320 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :runtime
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *70229104382320
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: vegas
|
49
|
+
requirement: &70229104381820 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :runtime
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *70229104381820
|
58
|
+
description: Provides a very simple extraction resource for extracting text from slices
|
59
|
+
of a PDF.
|
60
|
+
email:
|
61
|
+
- ssh@sshconnection.com
|
62
|
+
executables:
|
63
|
+
- restet-cli
|
64
|
+
- restet-web
|
65
|
+
extensions: []
|
66
|
+
extra_rdoc_files: []
|
67
|
+
files:
|
68
|
+
- .gitignore
|
69
|
+
- Gemfile
|
70
|
+
- Gemfile.lock
|
71
|
+
- README
|
72
|
+
- Rakefile
|
73
|
+
- bin/restet-cli
|
74
|
+
- bin/restet-web
|
75
|
+
- lib/restet.rb
|
76
|
+
- lib/restet/client.rb
|
77
|
+
- lib/restet/client/extraction_params.rb
|
78
|
+
- lib/restet/server.rb
|
79
|
+
- lib/restet/server/command.rb
|
80
|
+
- lib/restet/version.rb
|
81
|
+
- restet.gemspec
|
82
|
+
- test/client_test.rb
|
83
|
+
- test/test_helper.rb
|
84
|
+
homepage: http://github.com/sholden/restet
|
85
|
+
licenses: []
|
86
|
+
post_install_message:
|
87
|
+
rdoc_options: []
|
88
|
+
require_paths:
|
89
|
+
- lib
|
90
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
91
|
+
none: false
|
92
|
+
requirements:
|
93
|
+
- - ! '>='
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: '0'
|
96
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
requirements: []
|
103
|
+
rubyforge_project: restet
|
104
|
+
rubygems_version: 1.8.10
|
105
|
+
signing_key:
|
106
|
+
specification_version: 3
|
107
|
+
summary: A RESTful API exposing PDFlib TET functionality
|
108
|
+
test_files:
|
109
|
+
- test/client_test.rb
|
110
|
+
- test/test_helper.rb
|