pdf-extract 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/assign.rb +72 -0
- data/bin/config.json +4 -0
- data/bin/fac_v19n11_s5.mask.pdf +0 -0
- data/bin/margins.mask.pdf +0 -0
- data/bin/one-column.mask.pdf +24110 -39
- data/bin/pdf-extract +146 -0
- data/bin/s002040050107_Arch_Toxicol_1994_68_8.mask.pdf +0 -0
- data/bin/some3.mask.pdf +0 -0
- data/bin/some5.mask.pdf +0 -0
- data/bin/some6.mask.pdf +0 -0
- data/bin/train.rb +48 -0
- data/bin/two-column.mask.pdf +0 -0
- data/data/familynames.db +0 -0
- data/data/stopwords.txt +1 -0
- data/lib/analysis/columns.rb +75 -0
- data/lib/analysis/margins.rb +84 -0
- data/lib/analysis/sections.rb +156 -0
- data/lib/analysis/titles.rb +53 -0
- data/lib/analysis/zones.rb +128 -0
- data/lib/font_metrics.rb +240 -0
- data/lib/kmeans.rb +114 -0
- data/lib/language.rb +58 -0
- data/lib/model/characters.rb +320 -0
- data/lib/model/chunks.rb +103 -0
- data/lib/model/regions.rb +112 -0
- data/lib/multi_range.rb +69 -0
- data/lib/names.rb +85 -0
- data/lib/pdf-extract.rb +77 -0
- data/lib/pdf.rb +255 -0
- data/lib/references/references.rb +184 -0
- data/lib/references/resolve.rb +113 -0
- data/lib/references/resolved_references.rb +37 -0
- data/lib/spatial.rb +188 -0
- data/lib/view/abstract_view.rb +32 -0
- data/lib/view/pdf_view.rb +43 -0
- data/lib/view/png_view.rb +30 -0
- data/lib/view/xml_view.rb +113 -0
- metadata +208 -0
data/lib/spatial.rb
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
|
|
2
|
+
module PdfExtract
  # Helpers for "spatial objects": plain hashes that describe a rectangle
  # through :x, :y, :width and :height, and that may carry text in
  # :content and/or a list of line hashes in :lines.
  module Spatial

    # Defaults applied by merge and collapse when an option is not supplied.
    @@default_options = {
      :separator => '',
      :lines => false,
      :write_mode => :left_to_right
    }

    # Keys removed by drop_spatial.
    @@spatial_attribs = [:x, :y, :width, :height, :page_width, :page_height, :page]

    # Join two lines of text. A trailing hyphen on +top+ is treated as a
    # line-break hyphenation and removed; otherwise the lines are joined
    # with a single space.
    def self.concat_lines top, bottom
      if top =~ /\-\Z/
        top[0..-2] + bottom
      else
        top + ' ' + bottom
      end
    end

    # Return a copy of +obj+ without any of the spatial attribute keys.
    # +obj+ itself is not modified.
    def self.drop_spatial obj
      obj.dup.delete_if { |k, _| @@spatial_attribs.include? k }
    end

    # Store the combined line list of +a+ and +b+ in so[:lines] and return
    # +so+. An object that has no :lines key contributes itself as a single
    # line (see as_line). Mutates +so+; +a+ and +b+ are left untouched.
    def self.merge_lines a, b, so
      so[:lines] = []

      [a, b].each do |obj|
        if obj.key? :lines
          so[:lines] += obj[:lines]
        else
          so[:lines] << as_line(obj)
        end
      end

      so
    end

    # Merge two spatial objects into a new hash whose rectangle is the
    # bounding box of both. Keys of +b+ win over keys of +a+ for any
    # attribute that is not recomputed here.
    #
    # Options:
    #   :lines     - when true, collect line hashes in :lines instead of
    #                concatenating :content.
    #   :separator - string placed between the two :content values when
    #                :lines is false. Runs of whitespace in the result are
    #                collapsed to a single space.
    #
    # :font and :line_height are taken from whichever object has the longer
    # text content (from +b+ on a tie).
    def self.merge a, b, options={}
      options = @@default_options.merge options

      bottom_left = [ [a[:x], b[:x]].min, [a[:y], b[:y]].min ]
      top_right = [ [a[:x] + a[:width], b[:x] + b[:width]].max,
                    [a[:y] + a[:height], b[:y] + b[:height]].max ]

      so = a.merge(b).merge({
        :x => bottom_left[0],
        :y => bottom_left[1],
        :width => top_right[0] - bottom_left[0],
        :height => top_right[1] - bottom_left[1]
      })

      if options[:lines]
        merge_lines a, b, so
      else
        so[:content] = (a[:content] + options[:separator] + b[:content])
        so[:content] = so[:content].gsub(/\s+/, " ")
      end

      dominant = get_text_content(a).length > get_text_content(b).length ? a : b
      so[:font] = dominant[:font]
      so[:line_height] = dominant[:line_height]

      so
    end

    # Number of lines represented by +obj+: newline-separated lines in
    # :content plus the number of entries in :lines (both are counted when
    # both are present).
    def self.line_count obj
      line_count = 0
      line_count += obj[:content].count("\n") + 1 if obj[:content]
      line_count += obj[:lines].length if obj[:lines]
      line_count
    end

    # Return only the rectangle (:x, :y, :width, :height) of +obj+.
    def self.get_dimensions obj
      {
        :x => obj[:x],
        :y => obj[:y],
        :width => obj[:width],
        :height => obj[:height]
      }
    end

    # Reduce +obj+ to a line hash: its rectangle plus its :content.
    def self.as_line obj
      get_dimensions(obj).merge({:content => obj[:content]})
    end

    # Return the text of +obj+. When :lines is present the lines are joined:
    # a line ending in a hyphen is joined directly to the next one with the
    # hyphen removed, any other line is followed by a space, and the result
    # is stripped. Otherwise :content is returned; if neither is present
    # +obj+ itself is returned unchanged.
    def self.get_text_content obj
      if obj[:lines]
        obj[:lines].map do |line|
          if line[:content] =~ /\-\Z/
            line[:content][0..-2]
          else
            line[:content] + " "
          end
        end.join("").strip
      elsif obj[:content]
        obj[:content]
      else
        obj
      end
    end

    # Collapse a list of objects into one. Will merge objects in the
    # correct write order, specified by the :write_mode option. The
    # remaining options are passed on to merge.
    #
    # Returns nil for an empty list and a copy of the only element for a
    # single-element list. The input list and its elements are not modified.
    # Raises ArgumentError for a write mode that is not supported.
    def self.collapse objs, options={}
      options = @@default_options.merge options

      sorted = case options[:write_mode]
               when :left_to_right
                 # Highest y first, then smallest x first.
                 objs.sort_by { |obj| -(obj[:y].floor * 100) + (obj[:x] / 100.0) }
               else
                 raise ArgumentError, "unsupported write mode: #{options[:write_mode].inspect}"
               end

      return nil if sorted.empty?

      # merge returns a new hash rather than mutating its first argument,
      # so the running result has to be threaded through explicitly.
      sorted.drop(1).inject(sorted.first.dup) do |merged, obj|
        merge merged, obj, options
      end
    end

    # True when the rectangle of +b+ lies entirely inside (or on the edge
    # of) the rectangle of +a+.
    def self.contains? a, b
      a_x1 = a[:x]
      a_x2 = a[:x] + a[:width]
      a_y1 = a[:y]
      a_y2 = a[:y] + a[:height]

      b_x1 = b[:x]
      b_x2 = b[:x] + b[:width]
      b_y1 = b[:y]
      b_y2 = b[:y] + b[:height]

      b_x1 >= a_x1 && b_x2 <= a_x2 && b_y1 >= a_y1 && b_y2 <= a_y2
    end

    # Test whether +b+ overlaps +a+ along one axis. +from+ names the start
    # key and +by+ the extent key, e.g. (:x, :width) or (:y, :height).
    # True when either end of +b+'s interval falls within +a+'s interval.
    # NOTE(review): an interval of +b+ that strictly encloses +a+ on both
    # sides yields false; call with the arguments swapped as well if that
    # case matters to the caller.
    def self.overlap? from, by, a, b
      a_top = a[from] + a[by]
      b_top = b[from] + b[by]

      (b_top <= a_top && b_top >= a[from]) || (b[from] >= a[from] && b[from] <= a_top)
    end

    # Score every item against a set of ideal values.
    #
    # +ideals+ maps a type name to a hash of variable name => [ideal, weight].
    # For each type, every item gains (or adds to) a :"<type>_score" entry.
    # Per variable, an item's raw score is the inverse of its distance from
    # the ideal; raw scores are scaled so the closest item receives the full
    # weight. Mutates the hashes in +items+.
    def self.score items, ideals
      types = {}
      ideals.each { |name, vars| types[name] = vars.keys }

      types.each do |name, vars|
        score_name = (name.to_s + "_score").to_sym

        vars.each do |var_name|
          ideal, weight = ideals[name][var_name]

          scores = items.map do |item|
            diff = (item[var_name] - ideal).abs
            # Avoid dividing by zero for an exact match.
            diff = Float::MIN if diff.zero?
            1.0 / diff
          end

          score_max = scores.max

          items.each_with_index do |item, idx|
            item[score_name] ||= 0.0
            item[score_name] += (scores[idx] / score_max) * weight
          end
        end
      end
    end

  end
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
module PdfExtract
  # Base class for output views. Holds the analysed pdf and the name of the
  # source file, and provides helpers shared by the concrete views.
  class AbstractView

    # Palette handed out, in order, by auto_color (hex RGB without '#').
    @@auto_colors = ["ff0000", "00ff00", "0000ff", "ffff00",
                     "ff7f00", "ffc0cb", "800080", "f0e68c",
                     "a52a2a"]

    def initialize pdf, filename
      @pdf = pdf
      @filename = filename
    end

    # Return renderable objects - those whose spatials method was
    # called explicitly.
    def objects
      @pdf.spatial_objects.reject { |type, _| not @pdf.explicit_call? type }
    end

    # Return the next palette colour for this view instance. The palette is
    # cycled, so a colour is always returned even when more colours are
    # requested than the palette contains.
    def auto_color
      @next_auto_color ||= 0
      color = @@auto_colors[@next_auto_color % @@auto_colors.length]
      @next_auto_color += 1
      color
    end

    # Naive singularisation of an object type name: a trailing "ies"
    # becomes "y", then a trailing "s" is dropped.
    def singular_name name
      name = name.sub(/ies$/, 'y')
      name.sub(/s$/, '')
    end

  end
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
require 'prawn'
|
|
2
|
+
require_relative 'abstract_view'
|
|
3
|
+
|
|
4
|
+
module PdfExtract
  # View that overlays a translucent, per-type coloured rectangle on top of
  # the source PDF for every renderable object.
  class PdfView < AbstractView

    # Build a Prawn document based on the source file (used as a template)
    # and draw one rectangle per object that has a :page. Objects without a
    # :page are skipped. Returns the Prawn document.
    def render options={}
      Prawn::Document.new :template => @filename do |doc|
        objects.each_pair do |type, objs|
          shade = auto_color
          current_page = 1

          # Each object type starts drawing on the first page.
          doc.go_to_page current_page
          doc.fill_color shade

          objs.each do |obj|
            next if obj[:page].nil?

            # Switch pages lazily and set the fill colour again after
            # every page change.
            if current_page != obj[:page]
              current_page = obj[:page]
              doc.go_to_page current_page
              doc.fill_color shade
            end

            # XXX Works, but why?
            # NOTE(review): the 36pt shift presumably compensates for
            # Prawn's default page margin - TODO confirm.
            left = obj[:x] - 36
            top = obj[:y] + obj[:height] - 36

            doc.transparent 0.2 do
              doc.fill_rectangle [left, top], obj[:width], obj[:height]
            end
          end
        end
      end
    end

    # Write a document produced by render to +filename+.
    def self.write render, filename
      render.render_file filename
    end

  end
end
|
|
43
|
+
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'RMagick'
|
|
2
|
+
|
|
3
|
+
require_relative 'abstract_view'
|
|
4
|
+
|
|
5
|
+
module PdfExtract
  # View that paints every renderable object as a filled rectangle on a
  # white 800x1000 canvas, one colour per object type.
  class PngView < AbstractView

    # Draw all objects and return the resulting Magick::Image.
    def render options={}
      canvas = Magick::Image.new(800, 1000) { self.background_color = "white" }

      objects.each_pair do |type, objs|
        fill = "\##{auto_color}"

        objs.each do |obj|
          left = obj[:x]
          top = obj[:y]
          right = obj[:x] + obj[:width]
          bottom = obj[:y] + obj[:height]

          # A fresh drawing context is used for every rectangle.
          pen = Magick::Draw.new
          pen.fill = fill
          pen.rectangle(left, top, right, bottom)
          pen.draw canvas
        end
      end

      canvas
    end

    # Write an image produced by render to +filename+.
    def self.write render, filename
      render.write filename
    end

  end
end
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
require 'nokogiri'
|
|
2
|
+
|
|
3
|
+
require_relative 'abstract_view'
|
|
4
|
+
require_relative '../language'
|
|
5
|
+
|
|
6
|
+
module PdfExtract
  # View that serialises renderable objects to XML: a <pdf> root holding
  # page-less objects directly and all other objects grouped under <page>
  # elements.
  class XmlView < AbstractView

    # Keys never emitted as XML attributes.
    @@ignored_attributes = [:content, :page, :page_width, :page_height]

    # Keys whose values are rounded before being emitted.
    @@numeric_attributes = [:x, :y, :width, :height, :line_height,
                            :page_height, :page_width, :x_offset, :y_offset,
                            :spacing, :letter_ratio, :cap_ratio, :year_ratio]

    # Return renderable attributes: every entry of +obj+ that is neither
    # ignored nor a nested Hash/Array. Numeric attributes and anything
    # matching "*_score" are rounded to the :round render option.
    def get_xml_attributes obj
      attribs = {}

      obj.each_pair do |key, value|
        next if @@ignored_attributes.include? key
        next if value.kind_of?(Hash) || value.kind_of?(Array)

        if @@numeric_attributes.include?(key) || key.to_s =~ /.+_score/
          attribs[key] = value.round(@render_options[:round])
        else
          attribs[key] = value
        end
      end

      attribs
    end

    # Return the entries of +obj+ whose values are a Hash or an Array. The
    # :lines entry is left out unless the :lines render option is set.
    def get_nested_objs obj
      obj.select do |key, value|
        nested = value.kind_of?(Hash) || value.kind_of?(Array)
        nested && (@render_options[:lines] || key != :lines)
      end
    end

    # Render all objects to an XML string.
    #
    # Options (defaults): :lines (true), :round (2), :outline (false).
    def render options={}
      @render_options = {:lines => true, :round => 2, :outline => false}.merge(options)

      pages = {}
      page_params = {}
      pageless = {}

      # Bucket objects by page number and type; objects without a :page
      # key are collected separately.
      objects.each_pair do |type, objs|
        objs.each do |obj|
          unless obj.key? :page
            (pageless[type] ||= []) << obj
            next
          end

          number = obj[:page]
          by_type = (pages[number] ||= {})
          (by_type[type] ||= []) << obj

          # Page attributes come from the first object seen on that page.
          page_params[number] ||= {
            :width => obj[:page_width],
            :height => obj[:page_height],
            :number => number
          }
        end
      end

      builder = Nokogiri::XML::Builder.new do |xml|
        xml.pdf {
          pageless.each_pair do |type, objs|
            objs.each { |obj| write_obj_to_xml obj, type, xml }
          end

          pages.each_pair do |page_number, obj_types|
            xml.page(page_params[page_number]) {
              obj_types.each_pair do |type, objs|
                objs.each { |obj| write_obj_to_xml obj, type, xml }
              end
            }
          end
        }
      end

      builder.to_xml
    end

    # Emit one element for +obj+, named after the singular form of +type+,
    # followed recursively by elements for its nested hashes and arrays.
    # Text is written unless the :outline render option is set: the joined
    # text content when :lines is off, otherwise the object's own :content
    # if it has that key.
    def write_obj_to_xml obj, type, xml
      tag = singular_name(type.to_s)
      attributes = get_xml_attributes(obj)

      xml.send(tag, attributes) do

        if !@render_options[:outline]
          if !@render_options[:lines]
            xml.text Language::transliterate(Spatial.get_text_content obj)
          elsif obj.key?(:content)
            xml.text Language::transliterate(obj[:content].to_s)
          end
        end

        get_nested_objs(obj).each do |name, nested_obj|
          element_name = singular_name name.to_s

          case nested_obj
          when Hash
            write_obj_to_xml nested_obj, element_name, xml
          when Array
            nested_obj.each { |item| write_obj_to_xml item, element_name, xml }
          end
        end
      end
    end

    # Write an XML string produced by render to +filename+.
    def self.write render, filename
      File.open(filename, "w") { |file| file.write render }
    end

  end
end
|
metadata
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: pdf-extract
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
prerelease: false
|
|
5
|
+
segments:
|
|
6
|
+
- 0
|
|
7
|
+
- 0
|
|
8
|
+
- 1
|
|
9
|
+
version: 0.0.1
|
|
10
|
+
platform: ruby
|
|
11
|
+
authors:
|
|
12
|
+
- Karl Jonathan Ward
|
|
13
|
+
autorequire:
|
|
14
|
+
bindir: bin
|
|
15
|
+
cert_chain: []
|
|
16
|
+
|
|
17
|
+
date: 2011-10-21 00:00:00 +01:00
|
|
18
|
+
default_executable:
|
|
19
|
+
dependencies:
|
|
20
|
+
- !ruby/object:Gem::Dependency
|
|
21
|
+
name: pdf-reader
|
|
22
|
+
prerelease: false
|
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
|
24
|
+
none: false
|
|
25
|
+
requirements:
|
|
26
|
+
- - "="
|
|
27
|
+
- !ruby/object:Gem::Version
|
|
28
|
+
segments:
|
|
29
|
+
- 1
|
|
30
|
+
- 0
|
|
31
|
+
- 0
|
|
32
|
+
- beta1
|
|
33
|
+
version: 1.0.0.beta1
|
|
34
|
+
type: :runtime
|
|
35
|
+
version_requirements: *id001
|
|
36
|
+
- !ruby/object:Gem::Dependency
|
|
37
|
+
name: nokogiri
|
|
38
|
+
prerelease: false
|
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
|
40
|
+
none: false
|
|
41
|
+
requirements:
|
|
42
|
+
- - ">="
|
|
43
|
+
- !ruby/object:Gem::Version
|
|
44
|
+
segments:
|
|
45
|
+
- 1
|
|
46
|
+
- 5
|
|
47
|
+
- 0
|
|
48
|
+
version: 1.5.0
|
|
49
|
+
type: :runtime
|
|
50
|
+
version_requirements: *id002
|
|
51
|
+
- !ruby/object:Gem::Dependency
|
|
52
|
+
name: rmagick
|
|
53
|
+
prerelease: false
|
|
54
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
|
55
|
+
none: false
|
|
56
|
+
requirements:
|
|
57
|
+
- - ">="
|
|
58
|
+
- !ruby/object:Gem::Version
|
|
59
|
+
segments:
|
|
60
|
+
- 2
|
|
61
|
+
- 13
|
|
62
|
+
- 1
|
|
63
|
+
version: 2.13.1
|
|
64
|
+
type: :runtime
|
|
65
|
+
version_requirements: *id003
|
|
66
|
+
- !ruby/object:Gem::Dependency
|
|
67
|
+
name: prawn
|
|
68
|
+
prerelease: false
|
|
69
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
|
70
|
+
none: false
|
|
71
|
+
requirements:
|
|
72
|
+
- - ">="
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
segments:
|
|
75
|
+
- 0
|
|
76
|
+
- 11
|
|
77
|
+
- 1
|
|
78
|
+
version: 0.11.1
|
|
79
|
+
type: :runtime
|
|
80
|
+
version_requirements: *id004
|
|
81
|
+
- !ruby/object:Gem::Dependency
|
|
82
|
+
name: sqlite3
|
|
83
|
+
prerelease: false
|
|
84
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
|
85
|
+
none: false
|
|
86
|
+
requirements:
|
|
87
|
+
- - ">="
|
|
88
|
+
- !ruby/object:Gem::Version
|
|
89
|
+
segments:
|
|
90
|
+
- 1
|
|
91
|
+
- 3
|
|
92
|
+
- 4
|
|
93
|
+
version: 1.3.4
|
|
94
|
+
type: :runtime
|
|
95
|
+
version_requirements: *id005
|
|
96
|
+
- !ruby/object:Gem::Dependency
|
|
97
|
+
name: commander
|
|
98
|
+
prerelease: false
|
|
99
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
|
100
|
+
none: false
|
|
101
|
+
requirements:
|
|
102
|
+
- - ">="
|
|
103
|
+
- !ruby/object:Gem::Version
|
|
104
|
+
segments:
|
|
105
|
+
- 4
|
|
106
|
+
- 0
|
|
107
|
+
- 4
|
|
108
|
+
version: 4.0.4
|
|
109
|
+
type: :runtime
|
|
110
|
+
version_requirements: *id006
|
|
111
|
+
- !ruby/object:Gem::Dependency
|
|
112
|
+
name: json
|
|
113
|
+
prerelease: false
|
|
114
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
|
115
|
+
none: false
|
|
116
|
+
requirements:
|
|
117
|
+
- - ">="
|
|
118
|
+
- !ruby/object:Gem::Version
|
|
119
|
+
segments:
|
|
120
|
+
- 1
|
|
121
|
+
- 5
|
|
122
|
+
- 1
|
|
123
|
+
version: 1.5.1
|
|
124
|
+
type: :runtime
|
|
125
|
+
version_requirements: *id007
|
|
126
|
+
description:
|
|
127
|
+
email:
|
|
128
|
+
- kward@crossref.org
|
|
129
|
+
executables:
|
|
130
|
+
- pdf-extract
|
|
131
|
+
extensions: []
|
|
132
|
+
|
|
133
|
+
extra_rdoc_files: []
|
|
134
|
+
|
|
135
|
+
files:
|
|
136
|
+
- bin/assign.rb
|
|
137
|
+
- bin/config.json
|
|
138
|
+
- bin/fac_v19n11_s5.mask.pdf
|
|
139
|
+
- bin/margins.mask.pdf
|
|
140
|
+
- bin/one-column.mask.pdf
|
|
141
|
+
- bin/pdf-extract
|
|
142
|
+
- bin/s002040050107_Arch_Toxicol_1994_68_8.mask.pdf
|
|
143
|
+
- bin/some3.mask.pdf
|
|
144
|
+
- bin/some5.mask.pdf
|
|
145
|
+
- bin/some6.mask.pdf
|
|
146
|
+
- bin/train.rb
|
|
147
|
+
- bin/two-column.mask.pdf
|
|
148
|
+
- lib/analysis/columns.rb
|
|
149
|
+
- lib/analysis/margins.rb
|
|
150
|
+
- lib/analysis/sections.rb
|
|
151
|
+
- lib/analysis/titles.rb
|
|
152
|
+
- lib/analysis/zones.rb
|
|
153
|
+
- lib/font_metrics.rb
|
|
154
|
+
- lib/kmeans.rb
|
|
155
|
+
- lib/language.rb
|
|
156
|
+
- lib/model/characters.rb
|
|
157
|
+
- lib/model/chunks.rb
|
|
158
|
+
- lib/model/regions.rb
|
|
159
|
+
- lib/multi_range.rb
|
|
160
|
+
- lib/names.rb
|
|
161
|
+
- lib/pdf-extract.rb
|
|
162
|
+
- lib/pdf.rb
|
|
163
|
+
- lib/references/references.rb
|
|
164
|
+
- lib/references/resolve.rb
|
|
165
|
+
- lib/references/resolved_references.rb
|
|
166
|
+
- lib/spatial.rb
|
|
167
|
+
- lib/view/abstract_view.rb
|
|
168
|
+
- lib/view/pdf_view.rb
|
|
169
|
+
- lib/view/png_view.rb
|
|
170
|
+
- lib/view/xml_view.rb
|
|
171
|
+
- data/familynames.db
|
|
172
|
+
- data/stopwords.txt
|
|
173
|
+
has_rdoc: true
|
|
174
|
+
homepage: http://github.com/CrossRef/pdfextract
|
|
175
|
+
licenses: []
|
|
176
|
+
|
|
177
|
+
post_install_message:
|
|
178
|
+
rdoc_options: []
|
|
179
|
+
|
|
180
|
+
require_paths:
|
|
181
|
+
- lib
|
|
182
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
183
|
+
none: false
|
|
184
|
+
requirements:
|
|
185
|
+
- - ">="
|
|
186
|
+
- !ruby/object:Gem::Version
|
|
187
|
+
segments:
|
|
188
|
+
- 1
|
|
189
|
+
- 9
|
|
190
|
+
- 1
|
|
191
|
+
version: 1.9.1
|
|
192
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
193
|
+
none: false
|
|
194
|
+
requirements:
|
|
195
|
+
- - ">="
|
|
196
|
+
- !ruby/object:Gem::Version
|
|
197
|
+
segments:
|
|
198
|
+
- 0
|
|
199
|
+
version: "0"
|
|
200
|
+
requirements: []
|
|
201
|
+
|
|
202
|
+
rubyforge_project:
|
|
203
|
+
rubygems_version: 1.3.7
|
|
204
|
+
signing_key:
|
|
205
|
+
specification_version: 3
|
|
206
|
+
summary: PDF content extraction tool and library.
|
|
207
|
+
test_files: []
|
|
208
|
+
|