restet 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ coverage
6
+ InstalledFiles
7
+ lib/bundler/man
8
+ pkg
9
+ rdoc
10
+ spec/reports
11
+ test/tmp
12
+ test/version_tmp
13
+ tmp
14
+
15
+ # YARD artifacts
16
+ .yardoc
17
+ _yardoc
18
+ doc/
19
+
20
+ # OSX
21
+ .DS_Store
22
+
23
+ # Thumbnails
24
+ ._*
25
+
26
+ # Files that might appear on external disk
27
+ .Spotlight-V100
28
+ .Trashes
29
+
30
+ .redcar
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in restet.gemspec
4
+ gemspec
@@ -0,0 +1,30 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ restet (0.0.1)
5
+ multipart-post
6
+ sinatra
7
+ vegas
8
+ yajl-ruby
9
+
10
+ GEM
11
+ remote: http://rubygems.org/
12
+ specs:
13
+ multipart-post (1.1.4)
14
+ rack (1.3.5)
15
+ rack-protection (1.1.4)
16
+ rack
17
+ sinatra (1.3.1)
18
+ rack (~> 1.3, >= 1.3.4)
19
+ rack-protection (~> 1.1, >= 1.1.2)
20
+ tilt (~> 1.3, >= 1.3.3)
21
+ tilt (1.3.3)
22
+ vegas (0.1.8)
23
+ rack (>= 1.0.0)
24
+ yajl-ruby (1.1.0)
25
+
26
+ PLATFORMS
27
+ ruby
28
+
29
+ DEPENDENCIES
30
+ restet!
data/README ADDED
@@ -0,0 +1,6 @@
1
+ This is currently a very basic way to expose PDFlib TET CLI functions through a RESTful HTTP API.
2
+
3
+ Right now, I only need a small subset of features, but as I need more I will implement additional
4
+ extraction parameters and resources.
5
+
6
+ TET can be found here: http://www.pdflib.com/products/tet/
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ task :default => :test
4
+
5
+ require 'rake/testtask'
6
+ Rake::TestTask.new(:test) do |test|
7
+ test.libs << 'test'
8
+ test.pattern = 'test/**/*_test.rb'
9
+ test.verbose = true
10
+ end
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
4
+
5
+ require 'restet'
6
+ require 'restet/client'
7
+ require 'uri'
8
+
9
+ filename = ARGV[0]
10
+
11
# Resolves the URI of the restet extraction endpoint.
#
# Lookup order:
#   1. the RESTET_URI environment variable, when it is set;
#   2. the stripped contents of ~/.restet, when that file exists;
#   3. the built-in default, http://localhost:5679/extractions.
#
# Returns the URI as a String.
def get_restet_uri_string
  return ENV['RESTET_URI'] if ENV['RESTET_URI']

  # File.exist? and File.read do not expand "~" on their own, so the
  # home-relative path has to be resolved before touching the filesystem.
  config_path = begin
    File.expand_path('~/.restet')
  rescue ArgumentError
    # No home directory could be determined; fall through to the default.
    nil
  end
  return File.read(config_path).strip if config_path && File.exist?(config_path)

  'http://localhost:5679/extractions'
end
16
+
17
# Refuse to continue without a usable input file. `filename` is nil when the
# script is run without arguments, and File.exist?(nil) would raise a
# TypeError instead of exiting cleanly.
abort 'restet-cli: missing PDF file argument' unless filename
abort "restet-cli: no such file: #{filename}" unless File.exist?(filename)

# Open in binary mode: the file is uploaded as application/pdf and must not go
# through newline or encoding translation.
File.open(filename, 'rb') do |file|
  client = Restet::Client.new(URI.parse(get_restet_uri_string))
  extraction = client.extract(file) do |params|
    # ARGV[1..2] form the bottom-left corner and ARGV[3..4] the top-right
    # corner of the single slice, which is registered under the name :slice.
    params.slice :slice, [ARGV[1], ARGV[2]], [ARGV[3], ARGV[4]]
  end
  # Print the text the server returned for that slice.
  puts extraction[:slice]
end
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__) + '/../lib')
4
+ begin
5
+ require 'vegas'
6
+ rescue LoadError
7
+ require 'rubygems'
8
+ require 'vegas'
9
+ end
10
+
11
+ require 'restet'
12
+ require 'restet/server'
13
+
14
+
15
+ Vegas::Runner.new(Restet::Server, 'restet-web')
@@ -0,0 +1,8 @@
1
+ require 'yajl'
2
+ require 'yajl/json_gem'
3
+ require 'restet/version'
4
+
5
+
6
+ module Restet
7
+
8
+ end
@@ -0,0 +1,25 @@
1
+ require 'net/http'
2
+ require 'net/http/post/multipart'
3
+ require 'restet/client/extraction_params'
4
+
5
module Restet
  # HTTP client for the extraction resource of a restet server.
  class Client
    # uri - the parsed URI of the extraction endpoint. Its scheme, host, port
    #       and path are read when a request is made.
    def initialize(uri)
      @uri = uri
    end

    # Uploads +file+ together with the requested slices and returns the
    # server's JSON response parsed with symbolized keys.
    #
    # file - an IO positioned at the start of the PDF to upload.
    #
    # When a block is given it is yielded the ExtractionParams for this
    # request so the caller can register slices before the upload happens.
    def extract(file)
      params = ExtractionParams.new(file)
      yield params if block_given?

      # Honour the URI scheme; without :use_ssl an https endpoint would be
      # contacted in plaintext. Net::HTTP.start returns the block's value, so
      # no explicit (non-local) return is needed.
      Net::HTTP.start(@uri.host, @uri.port, :use_ssl => @uri.scheme == 'https') do |http|
        response = http.request(build_request(file, params))
        JSON.parse(response.body, :symbolize_keys => true)
      end
    end

    # Builds the multipart POST carrying the PDF under the :pdf field and the
    # JSON-encoded slice definitions under the :extraction field.
    def build_request(file, params)
      upload_io = UploadIO.new(file, 'application/pdf')
      Net::HTTP::Post::Multipart.new(@uri.path, :pdf => upload_io, :extraction => params.to_json)
    end
  end
end
@@ -0,0 +1,19 @@
1
module Restet
  class Client
    # Collects the named rectangular slices to extract from one PDF upload.
    class ExtractionParams
      attr_reader :file, :slices

      # file   - the file the parameters belong to (stored, not read here).
      # slices - optional Hash of already defined slices, keyed by name.
      def initialize(file, slices = {})
        @file, @slices = file, slices
      end

      # Registers (or replaces) the slice called +name+.
      #
      # name        - slice name; converted to a Symbol.
      # bottom_left - coordinates of the bottom-left corner.
      # top_right   - coordinates of the top-right corner.
      #
      # Returns the Hash stored for the slice.
      def slice(name, bottom_left, top_right)
        @slices[name.to_sym] = {:bottom_left => bottom_left, :top_right => top_right}
      end

      # Serializes the slice definitions as a JSON object.
      #
      # Extra arguments are accepted and ignored: JSON generators call
      # to_json(state) on nested objects, which raised ArgumentError when this
      # method took no parameters.
      def to_json(*_args)
        JSON.dump(@slices)
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'sinatra/base'
2
+
3
module Restet
  # Sinatra application exposing TET text extraction over HTTP.
  class Server < Sinatra::Base
    require 'restet/server/command'

    # Reports the version of the restet gem serving the request.
    get '/version' do
      Restet::VERSION
    end

    # Extracts text from an uploaded PDF.
    #
    # Expects a multipart form with:
    #   pdf        - the uploaded PDF file
    #   extraction - a JSON object mapping slice names to slice definitions
    #
    # Responds with a JSON object mapping each slice name to the output of
    # Command#execute for that slice. Requests with missing or malformed
    # parameters are rejected with 400 instead of failing with an unhandled
    # exception.
    post '/extractions' do
      upload = params[:pdf]
      halt 400, 'missing "pdf" file upload' unless upload.is_a?(Hash) && upload[:tempfile]
      halt 400, 'missing "extraction" parameter' unless params[:extraction]

      begin
        slices = JSON.parse(params[:extraction])
      rescue JSON::ParserError
        halt 400, '"extraction" is not valid JSON'
      end
      halt 400, '"extraction" must be a JSON object' unless slices.is_a?(Hash)

      extractions = {}
      slices.each do |name, slice|
        extractions[name] = Command.new(upload[:tempfile], slice).execute
      end
      JSON.dump(extractions)
    end
  end
end
@@ -0,0 +1,20 @@
1
module Restet
  class Server
    # Runs the `tet` command line tool against one rectangular slice of a PDF.
    class Command
      # Banner that execute removes from the tool's output. The pattern is
      # tied to this exact version and copyright text.
      BANNER = /PDFlib TET: PDFlib Text Extraction Toolkit, 4.0p2\n\(c\) 2002-2010 PDFlib GmbH www.pdflib.com sales@pdflib.com\n/

      # file  - the PDF to read; anything File.realpath accepts.
      # slice - Hash with 'bottom_left' and 'top_right' coordinate arrays.
      def initialize(file, slice)
        @file, @slice = file, slice
      end

      # Builds the value passed to --pageopt: an includebox option made of the
      # bottom-left coordinates followed by the top-right coordinates.
      def pageopt
        "includebox={{#{@slice['bottom_left'].join(' ')} #{@slice['top_right'].join(' ')}}}"
      end

      # Runs tet and returns its standard output with the banner removed.
      #
      # The slice comes from the HTTP client, so the command is started from
      # an argument vector rather than an interpolated shell string: no shell
      # is involved and quotes, `$(...)` or `;` in the slice values cannot run
      # other commands. Raises Errno::ENOENT when `tet` is not on the PATH.
      def execute
        argv = ['tet', '--text', '--pageopt', pageopt, '--outfile', '-', File.realpath(@file)]
        output = IO.popen(argv) { |io| io.read }
        output.gsub(BANNER, '')
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module Restet
2
+ VERSION = "0.0.1"
3
+ end
@@ -0,0 +1,25 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "restet/version"
4
+
5
# Gem specification for restet. The version comes from Restet::VERSION and the
# packaged files are whatever git currently tracks.
Gem::Specification.new do |s|
  s.name        = "restet"
  s.version     = Restet::VERSION
  s.authors     = ["Scott Holden"]
  s.email       = ["ssh@sshconnection.com"]
  s.homepage    = "http://github.com/sholden/restet"
  s.summary     = %q{A RESTful API exposing PDFlib TET functionality}
  s.description = %q{Provides a very simple extraction resource for extracting text from slices of a PDF.}

  s.rubyforge_project = "restet"

  # File lists are derived from the git index, so building the gem requires a
  # git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_runtime_dependency 'multipart-post'
  s.add_runtime_dependency 'yajl-ruby'
  s.add_runtime_dependency 'sinatra'
  s.add_runtime_dependency 'vegas'
end
@@ -0,0 +1,6 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'test_helper'))
2
+ require 'restet/client'
3
+
4
+ class ClientTest < Restet::TestCase
5
+
6
+ end
@@ -0,0 +1,12 @@
1
+ dir = File.dirname(File.expand_path(__FILE__))
2
+ $LOAD_PATH.unshift dir + '/../lib'
3
+ $TESTING = true
4
+
5
+ require 'rubygems'
6
+ require 'test/unit'
7
+ require 'restet'
8
+
9
+ module Restet
10
+ class TestCase < Test::Unit::TestCase
11
+ end
12
+ end
metadata ADDED
@@ -0,0 +1,110 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: restet
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Scott Holden
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-12-01 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: multipart-post
16
+ requirement: &70229104384680 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70229104384680
25
+ - !ruby/object:Gem::Dependency
26
+ name: yajl-ruby
27
+ requirement: &70229104383680 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *70229104383680
36
+ - !ruby/object:Gem::Dependency
37
+ name: sinatra
38
+ requirement: &70229104382320 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *70229104382320
47
+ - !ruby/object:Gem::Dependency
48
+ name: vegas
49
+ requirement: &70229104381820 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *70229104381820
58
+ description: Provides a very simple extraction resource for extracting text from slices
59
+ of a PDF.
60
+ email:
61
+ - ssh@sshconnection.com
62
+ executables:
63
+ - restet-cli
64
+ - restet-web
65
+ extensions: []
66
+ extra_rdoc_files: []
67
+ files:
68
+ - .gitignore
69
+ - Gemfile
70
+ - Gemfile.lock
71
+ - README
72
+ - Rakefile
73
+ - bin/restet-cli
74
+ - bin/restet-web
75
+ - lib/restet.rb
76
+ - lib/restet/client.rb
77
+ - lib/restet/client/extraction_params.rb
78
+ - lib/restet/server.rb
79
+ - lib/restet/server/command.rb
80
+ - lib/restet/version.rb
81
+ - restet.gemspec
82
+ - test/client_test.rb
83
+ - test/test_helper.rb
84
+ homepage: http://github.com/sholden/restet
85
+ licenses: []
86
+ post_install_message:
87
+ rdoc_options: []
88
+ require_paths:
89
+ - lib
90
+ required_ruby_version: !ruby/object:Gem::Requirement
91
+ none: false
92
+ requirements:
93
+ - - ! '>='
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ required_rubygems_version: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ requirements: []
103
+ rubyforge_project: restet
104
+ rubygems_version: 1.8.10
105
+ signing_key:
106
+ specification_version: 3
107
+ summary: A RESTful API exposing PDFlib TET functionality
108
+ test_files:
109
+ - test/client_test.rb
110
+ - test/test_helper.rb