pdf-extract 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/assign.rb +72 -0
- data/bin/config.json +4 -0
- data/bin/fac_v19n11_s5.mask.pdf +0 -0
- data/bin/margins.mask.pdf +0 -0
- data/bin/one-column.mask.pdf +24110 -39
- data/bin/pdf-extract +146 -0
- data/bin/s002040050107_Arch_Toxicol_1994_68_8.mask.pdf +0 -0
- data/bin/some3.mask.pdf +0 -0
- data/bin/some5.mask.pdf +0 -0
- data/bin/some6.mask.pdf +0 -0
- data/bin/train.rb +48 -0
- data/bin/two-column.mask.pdf +0 -0
- data/data/familynames.db +0 -0
- data/data/stopwords.txt +1 -0
- data/lib/analysis/columns.rb +75 -0
- data/lib/analysis/margins.rb +84 -0
- data/lib/analysis/sections.rb +156 -0
- data/lib/analysis/titles.rb +53 -0
- data/lib/analysis/zones.rb +128 -0
- data/lib/font_metrics.rb +240 -0
- data/lib/kmeans.rb +114 -0
- data/lib/language.rb +58 -0
- data/lib/model/characters.rb +320 -0
- data/lib/model/chunks.rb +103 -0
- data/lib/model/regions.rb +112 -0
- data/lib/multi_range.rb +69 -0
- data/lib/names.rb +85 -0
- data/lib/pdf-extract.rb +77 -0
- data/lib/pdf.rb +255 -0
- data/lib/references/references.rb +184 -0
- data/lib/references/resolve.rb +113 -0
- data/lib/references/resolved_references.rb +37 -0
- data/lib/spatial.rb +188 -0
- data/lib/view/abstract_view.rb +32 -0
- data/lib/view/pdf_view.rb +43 -0
- data/lib/view/png_view.rb +30 -0
- data/lib/view/xml_view.rb +113 -0
- metadata +208 -0
data/lib/spatial.rb
ADDED
@@ -0,0 +1,188 @@
|
|
1
|
+
|
2
|
+
module PdfExtract
  # Helpers for "spatial objects": plain hashes that describe a rectangle
  # through :x, :y, :width and :height and that usually carry text in
  # :content and/or a list of per-line hashes in :lines.
  module Spatial

    # Defaults applied by merge and collapse for any option the caller omits.
    @@default_options = {
      :separator => '',
      :lines => false,
      :write_mode => :left_to_right
    }

    # Keys that drop_spatial strips from an object.
    @@spatial_attribs = [:x, :y, :width, :height, :page_width, :page_height, :page]

    # Join two lines of text. When +top+ ends in a hyphen, its last character
    # is dropped and +bottom+ is appended directly; otherwise the two are
    # joined with a single space.
    def self.concat_lines top, bottom
      if top =~ /\-\Z/
        top[0..-2] + bottom
      else
        top + ' ' + bottom
      end
    end

    # Return a copy of +obj+ without any of the spatial attributes. +obj+
    # itself is left untouched.
    def self.drop_spatial obj
      obj.dup.delete_if { |k, _| @@spatial_attribs.include? k }
    end

    # Store in so[:lines] the lines of +a+ followed by the lines of +b+.
    # An object without a :lines key contributes itself as a single line
    # (see as_line). Mutates and returns +so+.
    def self.merge_lines a, b, so
      so[:lines] = []

      [a, b].each do |obj|
        if obj.key? :lines
          so[:lines] += obj[:lines]
        else
          so[:lines] << as_line(obj)
        end
      end

      so
    end

    # Merge two spatial objects into a new hash.
    #
    # The result starts as a.merge(b), so attributes of +b+ win on conflict,
    # and then receives the bounding box of both rectangles. With
    # options[:lines] set the text is kept as a :lines list; otherwise the
    # two :content strings are joined with options[:separator] and every run
    # of whitespace is squeezed to one space. :font and :line_height are taken
    # from +a+ only when its text is strictly longer than that of +b+.
    def self.merge a, b, options={}
      options = @@default_options.merge options

      bottom_left = [ [a[:x], b[:x]].min, [a[:y], b[:y]].min ]
      top_right = [ [a[:x] + a[:width], b[:x] + b[:width]].max,
                    [a[:y] + a[:height], b[:y] + b[:height]].max ]

      so = a.merge(b).merge({
        :x => bottom_left[0],
        :y => bottom_left[1],
        :width => top_right[0] - bottom_left[0],
        :height => top_right[1] - bottom_left[1]
      })

      if options[:lines]
        merge_lines a, b, so
      else
        joined = a[:content] + options[:separator] + b[:content]
        so[:content] = joined.gsub(/\s+/, " ")
      end

      dominant = get_text_content(a).length > get_text_content(b).length ? a : b
      so[:font] = dominant[:font]
      so[:line_height] = dominant[:line_height]

      so
    end

    # Count the lines of +obj+: the newline-separated lines of :content (if
    # present) plus the number of entries in :lines (if present).
    def self.line_count obj
      line_count = 0
      line_count += obj[:content].count("\n") + 1 if obj[:content]
      line_count += obj[:lines].length if obj[:lines]
      line_count
    end

    # Return a new hash holding only the :x, :y, :width and :height of +obj+.
    def self.get_dimensions obj
      {
        :x => obj[:x],
        :y => obj[:y],
        :width => obj[:width],
        :height => obj[:height]
      }
    end

    # Reduce +obj+ to a line hash: its dimensions plus its :content.
    def self.as_line obj
      get_dimensions(obj).merge({:content => obj[:content]})
    end

    # Return the text of +obj+. When :lines is present the lines are joined,
    # dropping the last character of a line that ends in a hyphen and putting
    # a space after every other line; the result is stripped. Otherwise
    # :content is returned, and when that is missing too, +obj+ itself.
    def self.get_text_content obj
      if obj[:lines]
        obj[:lines].map do |line|
          if line[:content] =~ /\-\Z/
            line[:content][0..-2]
          else
            line[:content] + " "
          end
        end.join("").strip
      elsif obj[:content]
        obj[:content]
      else
        obj
      end
    end

    # Collapse a list of objects into one. Objects are merged in write
    # order, selected by options[:write_mode]; the remaining options are
    # forwarded to merge.
    #
    # For :left_to_right the order is descending floor(y), then ascending x.
    # Returns nil for an empty list and a copy of the sole object for a
    # one-element list. Raises ArgumentError for an unknown write mode.
    def self.collapse objs, options={}
      options = @@default_options.merge options

      sorted = case options[:write_mode]
               when :left_to_right
                 objs.sort_by { |obj| [-obj[:y].floor, obj[:x]] }
               else
                 raise ArgumentError,
                       "unsupported write_mode: #{options[:write_mode].inspect}"
               end

      return nil if sorted.empty?

      # merge returns a new object each time, so its result has to be
      # carried forward as the accumulator.
      sorted.drop(1).reduce(sorted.first.dup) do |merged, obj|
        merge merged, obj, options
      end
    end

    # True when rectangle +b+ lies entirely within rectangle +a+ (shared
    # edges count as inside).
    def self.contains? a, b
      a_x1 = a[:x]
      a_x2 = a[:x] + a[:width]
      a_y1 = a[:y]
      a_y2 = a[:y] + a[:height]

      b_x1 = b[:x]
      b_x2 = b[:x] + b[:width]
      b_y1 = b[:y]
      b_y2 = b[:y] + b[:height]

      b_x1 >= a_x1 && b_x2 <= a_x2 && b_y1 >= a_y1 && b_y2 <= a_y2
    end

    # True when +a+ and +b+ overlap along one axis. +from+ names the key
    # holding the start of the extent (e.g. :y) and +by+ the key holding its
    # length (e.g. :height). Extents that merely touch count as overlapping,
    # as does one extent enclosing the other.
    def self.overlap? from, by, a, b
      a_top = a[from] + a[by]
      b_top = b[from] + b[by]

      a[from] <= b_top && b[from] <= a_top
    end

    # Add weighted closeness scores to every item, in place.
    #
    # +ideals+ maps a type name to a hash of variable name => [ideal, weight].
    # For each variable an item's raw score is 1 / |item value - ideal|; raw
    # scores are normalised by the best raw score among +items+ and multiplied
    # by the weight. The sum over all variables of a type is accumulated in
    # item[:"<type>_score"]. Returns a hash of type name => variable names.
    def self.score items, ideals
      types = {}
      ideals.each { |name, vars| types[name] = vars.keys }

      types.each do |name, var_names|
        score_name = (name.to_s + "_score").to_sym

        var_names.each do |var_name|
          ideal = ideals[name][var_name][0]
          weight = ideals[name][var_name][1]

          scores = items.map do |item|
            diff = (item[var_name] - ideal).abs
            # An exact match would divide by zero; use the smallest positive
            # float so the match gets the largest finite score instead.
            diff = Float::MIN if diff.zero?
            1.0 / diff
          end

          score_max = scores.max

          items.each_with_index do |item, idx|
            item[score_name] ||= 0.0
            item[score_name] += (scores[idx] / score_max) * weight
          end
        end
      end
    end

  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module PdfExtract
  # Base class for the output views. Holds the analysed PDF and the name of
  # the source file, and provides helpers shared by the concrete views.
  class AbstractView

    # Palette handed out by auto_color, as hex RGB strings without a
    # leading '#'.
    @@auto_colors = ["ff0000", "00ff00", "0000ff", "ffff00",
                     "ff7f00", "ffc0cb", "800080", "f0e68c",
                     "a52a2a"]

    def initialize pdf, filename
      @pdf = pdf
      @filename = filename
    end

    # Return renderable objects - those whose spatials method was
    # called explicitly.
    def objects
      @pdf.spatial_objects.reject { |type, _| !@pdf.explicit_call?(type) }
    end

    # Return the next palette colour for this view instance. The palette is
    # cycled, so a colour is still returned (rather than nil) once more
    # colours have been requested than the palette holds.
    def auto_color
      @next_auto_color ||= 0
      color = @@auto_colors[@next_auto_color % @@auto_colors.length]
      @next_auto_color += 1
      color
    end

    # Naive singularisation of +name+: a trailing "ies" becomes "y", then a
    # single trailing "s" is removed.
    def singular_name name
      name.sub(/ies$/, 'y').sub(/s$/, '')
    end

  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'prawn'
|
2
|
+
require_relative 'abstract_view'
|
3
|
+
|
4
|
+
module PdfExtract
  # View that draws every renderable object as a translucent filled
  # rectangle on top of the source PDF, one colour per object type.
  class PdfView < AbstractView

    # Build a Prawn document from the source file (used as a template) and
    # paint one rectangle per object that has a :page. Objects without a
    # page are skipped. Returns the Prawn document.
    def render options={}
      Prawn::Document.new(:template => @filename) do |doc|
        objects.each_pair do |_type, objs|
          current_page = 1
          color = auto_color
          doc.go_to_page current_page
          doc.fill_color color

          objs.each do |obj|
            next if obj[:page].nil?

            unless obj[:page] == current_page
              current_page = obj[:page]
              doc.go_to_page current_page
              # The fill colour is set again after every page switch.
              doc.fill_color color
            end

            # NOTE(review): the 36pt shift presumably cancels Prawn's default
            # half-inch page margin, since drawing coordinates are relative
            # to the margin box - confirm. The point handed to
            # fill_rectangle is the rectangle's upper-left corner.
            upper_left = [obj[:x] - 36, obj[:y] + obj[:height] - 36]

            doc.transparent(0.2) do
              doc.fill_rectangle upper_left, obj[:width], obj[:height]
            end
          end
        end
      end
    end

    # Write a document produced by render to +filename+.
    def self.write render, filename
      render.render_file filename
    end

  end
end
|
43
|
+
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'RMagick'
|
2
|
+
|
3
|
+
require_relative 'abstract_view'
|
4
|
+
|
5
|
+
module PdfExtract
  # View that paints every renderable object as a filled rectangle on a
  # blank white 800x1000 image, one colour per object type.
  class PngView < AbstractView

    # Draw all objects onto a fresh image and return it. Object coordinates
    # are used as pixel coordinates without any scaling or flipping, and the
    # :page of an object is not consulted.
    def render options={}
      canvas = Magick::Image.new(800, 1000) { self.background_color = "white" }

      objects.each_pair do |_type, objs|
        color = auto_color

        objs.each do |obj|
          left = obj[:x]
          top = obj[:y]

          pen = Magick::Draw.new
          pen.fill = "\##{color}"
          pen.rectangle(left, top, left + obj[:width], top + obj[:height])
          pen.draw canvas
        end
      end

      canvas
    end

    # Write an image produced by render to +filename+.
    def self.write render, filename
      render.write filename
    end

  end
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
require_relative 'abstract_view'
|
4
|
+
require_relative '../language'
|
5
|
+
|
6
|
+
module PdfExtract
  # View that serialises the renderable objects to an XML document: a <pdf>
  # root, objects without a page directly beneath it, and one <page> element
  # per page holding that page's objects.
  class XmlView < AbstractView

    # Keys that are never emitted as XML attributes.
    @@ignored_attributes = [:content, :page, :page_width, :page_height]

    # Keys whose values are rounded before being emitted.
    @@numeric_attributes = [:x, :y, :width, :height, :line_height,
                            :page_height, :page_width, :x_offset, :y_offset,
                            :spacing, :letter_ratio, :cap_ratio, :year_ratio]

    # Return renderable attributes: every entry of +obj+ that is neither
    # ignored nor a nested Hash/Array. Values of numeric attributes, and of
    # keys matching /.+_score/, are rounded to @render_options[:round] places.
    def get_xml_attributes obj
      attribs = {}

      obj.each_pair do |key, value|
        next if @@ignored_attributes.include?(key)
        next if value.kind_of?(Hash) || value.kind_of?(Array)

        if @@numeric_attributes.include?(key) || key.to_s =~ /.+_score/
          attribs[key] = value.round(@render_options[:round])
        else
          attribs[key] = value
        end
      end

      attribs
    end

    # Return the entries of +obj+ whose values are Hashes or Arrays. The
    # :lines entry is left out unless @render_options[:lines] is set.
    def get_nested_objs obj
      keep_lines = @render_options[:lines]

      obj.reject do |key, value|
        scalar = !value.kind_of?(Hash) && !value.kind_of?(Array)
        scalar || (!keep_lines && key == :lines)
      end
    end

    # Render all objects to an XML string.
    #
    # Options (defaults): :lines (true), :round (2), :outline (false).
    def render options={}
      @render_options = {:lines => true, :round => 2, :outline => false}.merge(options)

      pages = {}
      page_params = {}
      pageless = {}

      # Bucket objects by page number and type; objects without a :page key
      # are collected separately by type.
      objects.each_pair do |type, objs|
        objs.each do |obj|
          unless obj.key? :page
            (pageless[type] ||= []) << obj
            next
          end

          number = obj[:page]
          by_type = (pages[number] ||= {})
          (by_type[type] ||= []) << obj

          # The first object seen on a page supplies the <page> attributes.
          page_params[number] ||= {
            :width => obj[:page_width],
            :height => obj[:page_height],
            :number => number
          }
        end
      end

      builder = Nokogiri::XML::Builder.new do |xml|
        xml.pdf {
          pageless.each_pair do |type, objs|
            objs.each { |obj| write_obj_to_xml obj, type, xml }
          end

          pages.each_pair do |page_number, obj_types|
            xml.page(page_params[page_number]) {
              obj_types.each_pair do |type, objs|
                objs.each { |obj| write_obj_to_xml obj, type, xml }
              end
            }
          end
        }
      end

      builder.to_xml
    end

    # Emit one element for +obj+, named after the singular form of +type+,
    # followed recursively by an element for each nested Hash and for each
    # item of each nested Array. Text is omitted when
    # @render_options[:outline] is set.
    def write_obj_to_xml obj, type, xml
      xml.send(singular_name(type.to_s), get_xml_attributes(obj)) do

        unless @render_options[:outline]
          if @render_options[:lines]
            xml.text Language::transliterate(obj[:content].to_s) if obj.key?(:content)
          else
            xml.text Language::transliterate(Spatial.get_text_content(obj))
          end
        end

        get_nested_objs(obj).each do |name, nested|
          child_name = singular_name name.to_s

          case nested
          when Hash
            write_obj_to_xml nested, child_name, xml
          when Array
            nested.each { |item| write_obj_to_xml item, child_name, xml }
          end
        end
      end
    end

    # Write an XML string produced by render to +filename+.
    def self.write render, filename
      File.open(filename, "w") { |file| file.write render }
    end

  end
end
|
metadata
ADDED
@@ -0,0 +1,208 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: pdf-extract
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: 0.0.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Karl Jonathan Ward
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-10-21 00:00:00 +01:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: pdf-reader
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - "="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 1
|
30
|
+
- 0
|
31
|
+
- 0
|
32
|
+
- beta1
|
33
|
+
version: 1.0.0.beta1
|
34
|
+
type: :runtime
|
35
|
+
version_requirements: *id001
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: nokogiri
|
38
|
+
prerelease: false
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
segments:
|
45
|
+
- 1
|
46
|
+
- 5
|
47
|
+
- 0
|
48
|
+
version: 1.5.0
|
49
|
+
type: :runtime
|
50
|
+
version_requirements: *id002
|
51
|
+
- !ruby/object:Gem::Dependency
|
52
|
+
name: rmagick
|
53
|
+
prerelease: false
|
54
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
segments:
|
60
|
+
- 2
|
61
|
+
- 13
|
62
|
+
- 1
|
63
|
+
version: 2.13.1
|
64
|
+
type: :runtime
|
65
|
+
version_requirements: *id003
|
66
|
+
- !ruby/object:Gem::Dependency
|
67
|
+
name: prawn
|
68
|
+
prerelease: false
|
69
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
70
|
+
none: false
|
71
|
+
requirements:
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
segments:
|
75
|
+
- 0
|
76
|
+
- 11
|
77
|
+
- 1
|
78
|
+
version: 0.11.1
|
79
|
+
type: :runtime
|
80
|
+
version_requirements: *id004
|
81
|
+
- !ruby/object:Gem::Dependency
|
82
|
+
name: sqlite3
|
83
|
+
prerelease: false
|
84
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
85
|
+
none: false
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
segments:
|
90
|
+
- 1
|
91
|
+
- 3
|
92
|
+
- 4
|
93
|
+
version: 1.3.4
|
94
|
+
type: :runtime
|
95
|
+
version_requirements: *id005
|
96
|
+
- !ruby/object:Gem::Dependency
|
97
|
+
name: commander
|
98
|
+
prerelease: false
|
99
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
100
|
+
none: false
|
101
|
+
requirements:
|
102
|
+
- - ">="
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
segments:
|
105
|
+
- 4
|
106
|
+
- 0
|
107
|
+
- 4
|
108
|
+
version: 4.0.4
|
109
|
+
type: :runtime
|
110
|
+
version_requirements: *id006
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: json
|
113
|
+
prerelease: false
|
114
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
115
|
+
none: false
|
116
|
+
requirements:
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
segments:
|
120
|
+
- 1
|
121
|
+
- 5
|
122
|
+
- 1
|
123
|
+
version: 1.5.1
|
124
|
+
type: :runtime
|
125
|
+
version_requirements: *id007
|
126
|
+
description:
|
127
|
+
email:
|
128
|
+
- kward@crossref.org
|
129
|
+
executables:
|
130
|
+
- pdf-extract
|
131
|
+
extensions: []
|
132
|
+
|
133
|
+
extra_rdoc_files: []
|
134
|
+
|
135
|
+
files:
|
136
|
+
- bin/assign.rb
|
137
|
+
- bin/config.json
|
138
|
+
- bin/fac_v19n11_s5.mask.pdf
|
139
|
+
- bin/margins.mask.pdf
|
140
|
+
- bin/one-column.mask.pdf
|
141
|
+
- bin/pdf-extract
|
142
|
+
- bin/s002040050107_Arch_Toxicol_1994_68_8.mask.pdf
|
143
|
+
- bin/some3.mask.pdf
|
144
|
+
- bin/some5.mask.pdf
|
145
|
+
- bin/some6.mask.pdf
|
146
|
+
- bin/train.rb
|
147
|
+
- bin/two-column.mask.pdf
|
148
|
+
- lib/analysis/columns.rb
|
149
|
+
- lib/analysis/margins.rb
|
150
|
+
- lib/analysis/sections.rb
|
151
|
+
- lib/analysis/titles.rb
|
152
|
+
- lib/analysis/zones.rb
|
153
|
+
- lib/font_metrics.rb
|
154
|
+
- lib/kmeans.rb
|
155
|
+
- lib/language.rb
|
156
|
+
- lib/model/characters.rb
|
157
|
+
- lib/model/chunks.rb
|
158
|
+
- lib/model/regions.rb
|
159
|
+
- lib/multi_range.rb
|
160
|
+
- lib/names.rb
|
161
|
+
- lib/pdf-extract.rb
|
162
|
+
- lib/pdf.rb
|
163
|
+
- lib/references/references.rb
|
164
|
+
- lib/references/resolve.rb
|
165
|
+
- lib/references/resolved_references.rb
|
166
|
+
- lib/spatial.rb
|
167
|
+
- lib/view/abstract_view.rb
|
168
|
+
- lib/view/pdf_view.rb
|
169
|
+
- lib/view/png_view.rb
|
170
|
+
- lib/view/xml_view.rb
|
171
|
+
- data/familynames.db
|
172
|
+
- data/stopwords.txt
|
173
|
+
has_rdoc: true
|
174
|
+
homepage: http://github.com/CrossRef/pdfextract
|
175
|
+
licenses: []
|
176
|
+
|
177
|
+
post_install_message:
|
178
|
+
rdoc_options: []
|
179
|
+
|
180
|
+
require_paths:
|
181
|
+
- lib
|
182
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
183
|
+
none: false
|
184
|
+
requirements:
|
185
|
+
- - ">="
|
186
|
+
- !ruby/object:Gem::Version
|
187
|
+
segments:
|
188
|
+
- 1
|
189
|
+
- 9
|
190
|
+
- 1
|
191
|
+
version: 1.9.1
|
192
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
193
|
+
none: false
|
194
|
+
requirements:
|
195
|
+
- - ">="
|
196
|
+
- !ruby/object:Gem::Version
|
197
|
+
segments:
|
198
|
+
- 0
|
199
|
+
version: "0"
|
200
|
+
requirements: []
|
201
|
+
|
202
|
+
rubyforge_project:
|
203
|
+
rubygems_version: 1.3.7
|
204
|
+
signing_key:
|
205
|
+
specification_version: 3
|
206
|
+
summary: PDF content extraction tool and library.
|
207
|
+
test_files: []
|
208
|
+
|