pdf-extract 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/spatial.rb ADDED
@@ -0,0 +1,188 @@
1
+
2
module PdfExtract
  # Helpers for "spatial" objects: plain hashes describing a rectangle via
  # :x, :y, :width and :height (with :x/:y being the bottom-left corner, see
  # +merge+), optionally carrying text in :content and/or :lines.
  module Spatial

    @@default_options = {
      :separator => '',
      :lines => false,
      :write_mode => :left_to_right
    }

    @@spatial_attribs = [:x, :y, :width, :height, :page_width, :page_height, :page]

    # Join two lines of text. When +top+ ends in a hyphen its last character
    # is dropped and +bottom+ is appended directly; otherwise the two are
    # joined with a single space.
    def self.concat_lines(top, bottom)
      if top =~ /\-\Z/
        top[0..-2] + bottom
      else
        top + ' ' + bottom
      end
    end

    # Return a copy of +obj+ without any of the spatial attributes
    # (position, size and page information).
    def self.drop_spatial(obj)
      obj.dup.delete_if { |k, _| @@spatial_attribs.include?(k) }
    end

    # Store in so[:lines] the lines of +a+ followed by the lines of +b+.
    # An object without a :lines key contributes itself as a single line.
    # Mutates and returns +so+.
    def self.merge_lines(a, b, so)
      so[:lines] = []

      [a, b].each do |obj|
        if obj.key?(:lines)
          so[:lines] += obj[:lines]
        else
          so[:lines] << as_line(obj)
        end
      end

      so
    end

    # Merge two spatial objects into a new hash whose rectangle is the
    # bounding box of both. Neither argument is modified.
    #
    # Options (defaults come from @@default_options):
    #   :separator - string placed between the two :content values
    #   :lines     - when true, collect :lines instead of joining :content
    #
    # :font and :line_height are taken from whichever object has the longer
    # text content (+b+ wins ties).
    def self.merge(a, b, options = {})
      options = @@default_options.merge(options)

      left   = [a[:x], b[:x]].min
      bottom = [a[:y], b[:y]].min
      right  = [a[:x] + a[:width], b[:x] + b[:width]].max
      top    = [a[:y] + a[:height], b[:y] + b[:height]].max

      so = a.merge(b).merge({
        :x => left,
        :y => bottom,
        :width => right - left,
        :height => top - bottom
      })

      if options[:lines]
        merge_lines(a, b, so)
      else
        joined = a[:content] + options[:separator] + b[:content]
        # Collapse every run of whitespace into a single space.
        so[:content] = joined.gsub(/\s+/, " ")
      end

      dominant = get_text_content(a).length > get_text_content(b).length ? a : b
      so[:font] = dominant[:font]
      so[:line_height] = dominant[:line_height]

      so
    end

    # Number of lines held by +obj+: newline-separated lines in :content
    # plus the number of entries in :lines (either may be absent).
    def self.line_count(obj)
      count = 0
      count += obj[:content].count("\n") + 1 if obj[:content]
      count += obj[:lines].length if obj[:lines]
      count
    end

    # Extract only the rectangle (:x, :y, :width, :height) of +obj+.
    def self.get_dimensions(obj)
      {
        :x => obj[:x],
        :y => obj[:y],
        :width => obj[:width],
        :height => obj[:height]
      }
    end

    # Represent +obj+ as a line: its rectangle plus its :content.
    def self.as_line(obj)
      get_dimensions(obj).merge({:content => obj[:content]})
    end

    # Text of +obj+. With :lines present the lines are joined, removing a
    # trailing hyphen from a line and otherwise separating lines by a space,
    # and the result is stripped. Without :lines the :content is returned;
    # with neither, +obj+ itself is returned.
    def self.get_text_content(obj)
      if obj[:lines]
        obj[:lines].map do |line|
          if line[:content] =~ /\-\Z/
            line[:content][0..-2]
          else
            line[:content] + " "
          end
        end.join("").strip
      elsif obj[:content]
        obj[:content]
      else
        obj
      end
    end

    # Collapse a list of objects into one. Will merge objects in the
    # correct write order, specified by write_mode. The remaining options
    # are forwarded to +merge+.
    #
    # Returns nil for an empty list and a copy of the object for a list of
    # one. Raises ArgumentError for a write_mode other than :left_to_right.
    # +objs+ and its elements are not modified.
    def self.collapse(objs, options = {})
      options = @@default_options.merge(options)

      sorted = case options[:write_mode]
               when :left_to_right
                 # Higher floor(y) first; x (scaled down) orders objects
                 # that share the same floored y from left to right.
                 objs.sort_by { |obj| -(obj[:y].floor * 100) + (obj[:x] / 100.0) }
               else
                 raise ArgumentError,
                       "unsupported write_mode: #{options[:write_mode].inspect}"
               end

      return nil if sorted.empty?

      first, *rest = sorted
      # merge returns a new hash, so its result must be carried forward.
      rest.inject(first.dup) { |merged, obj| merge(merged, obj, options) }
    end

    # True when the rectangle of +b+ lies entirely within (or exactly on)
    # the rectangle of +a+.
    def self.contains?(a, b)
      a_x1 = a[:x]
      a_x2 = a[:x] + a[:width]
      a_y1 = a[:y]
      a_y2 = a[:y] + a[:height]

      b_x1 = b[:x]
      b_x2 = b[:x] + b[:width]
      b_y1 = b[:y]
      b_y2 = b[:y] + b[:height]

      b_x1 >= a_x1 && b_x2 <= a_x2 && b_y1 >= a_y1 && b_y2 <= a_y2
    end

    # One-dimensional overlap test along the axis given by +from+ (start
    # key, e.g. :x) and +by+ (extent key, e.g. :width). True when either end
    # of +b+'s extent falls inside +a+'s extent (bounds inclusive).
    #
    # NOTE(review): when +b+ strictly encloses +a+ neither end of +b+ is
    # inside +a+, so this returns false; confirm whether callers rely on
    # that asymmetry before widening the test.
    def self.overlap?(from, by, a, b)
      a_top = a[from] + a[by]
      b_top = b[from] + b[by]

      (b_top <= a_top && b_top >= a[from]) || (b[from] >= a[from] && b[from] <= a_top)
    end

    # Score +items+ against +ideals+ and accumulate the result on each item.
    #
    # +ideals+ maps a name to a hash of { attribute => [ideal_value, weight] }.
    # For every name, each item receives a "<name>_score" key holding the sum,
    # over the attributes, of weight * (closeness / best closeness), where
    # closeness is 1 / |item value - ideal value|. Items are mutated in place.
    # Returns a hash mapping each name to its list of attribute names.
    def self.score(items, ideals)
      types = {}
      ideals.each { |name, vars| types[name] = vars.keys }

      types.each do |name, vars|
        score_name = (name.to_s + "_score").to_sym

        vars.each do |var_name|
          ideal = ideals[name][var_name][0]
          weight = ideals[name][var_name][1]

          scores = items.map do |item|
            diff = (item[var_name] - ideal).abs
            # Avoid dividing by zero on an exact match.
            diff = Float::MIN if diff.zero?
            1.0 / diff
          end

          score_max = scores.max

          items.each_with_index do |item, idx|
            item[score_name] ||= 0.0
            item[score_name] += (scores[idx] / score_max) * weight
          end
        end
      end
    end

  end
end
@@ -0,0 +1,32 @@
1
module PdfExtract
  # Base class for views. Holds the PDF model and the source filename and
  # provides helpers shared by the concrete views.
  class AbstractView

    @@auto_colors = ["ff0000", "00ff00", "0000ff", "ffff00",
                     "ff7f00", "ffc0cb", "800080", "f0e68c",
                     "a52a2a"]

    # pdf      - object answering +spatial_objects+ and +explicit_call?+
    # filename - stored as @filename for use by subclasses
    def initialize(pdf, filename)
      @pdf = pdf
      @filename = filename
    end

    # Return renderable objects - those whose spatials method was
    # called explicitly.
    def objects
      @pdf.spatial_objects.select { |type, _| @pdf.explicit_call?(type) }
    end

    # Return the next colour (hex string without a leading '#') from the
    # palette. The palette is cycled, so a colour is always returned even
    # when more colours are requested than the palette holds.
    def auto_color
      @next_auto_color ||= 0
      color = @@auto_colors[@next_auto_color % @@auto_colors.length]
      @next_auto_color += 1
      color
    end

    # Naive singular form of +name+: a trailing "ies" becomes "y", then a
    # trailing "s" is removed. Returns a new string.
    def singular_name(name)
      name.sub(/ies$/, 'y').sub(/s$/, '')
    end

  end
end
@@ -0,0 +1,43 @@
1
+ require 'prawn'
2
+ require_relative 'abstract_view'
3
+
4
module PdfExtract
  # View that opens the source PDF as a Prawn template and paints a
  # translucent filled rectangle over every renderable object.
  class PdfView < AbstractView

    # Build and return the Prawn document. Each object type gets its own
    # colour from +auto_color+; objects without a :page are skipped.
    # +options+ is accepted but not used here.
    def render(options = {})
      Prawn::Document.new(:template => @filename) do |doc|
        objects.each_pair do |_type, objs|
          current_page = 1
          highlight = auto_color
          doc.go_to_page(current_page)
          doc.fill_color(highlight)

          objs.each do |obj|
            page = obj[:page]
            next if page.nil?

            unless page == current_page
              current_page = page
              doc.go_to_page(current_page)
              # The fill colour is set again after every page switch.
              doc.fill_color(highlight)
            end

            # NOTE(review): the 36 offset presumably compensates for
            # Prawn's default page margin (0.5in = 36pt), translating page
            # coordinates into margin-box coordinates - confirm.
            top_left = [obj[:x] - 36, obj[:y] + obj[:height] - 36]

            doc.transparent(0.2) do
              doc.fill_rectangle(top_left, obj[:width], obj[:height])
            end
          end
        end
      end
    end

    # Write a document produced by +render+ to +filename+.
    def self.write(render, filename)
      render.render_file(filename)
    end

  end
end
43
+
@@ -0,0 +1,30 @@
1
+ require 'RMagick'
2
+
3
+ require_relative 'abstract_view'
4
+
5
module PdfExtract
  # View that draws every renderable object as a filled rectangle on a
  # single white 800x1000 RMagick image.
  class PngView < AbstractView

    # Build and return the image. Each object type gets its own colour
    # from +auto_color+. +options+ is accepted but not used here.
    def render(options = {})
      canvas = Magick::Image.new(800, 1000) { self.background_color = "white" }

      objects.each_pair do |_type, objs|
        color = auto_color

        objs.each do |obj|
          left = obj[:x]
          top = obj[:y]

          # A fresh drawing context is used for every rectangle.
          pen = Magick::Draw.new
          pen.fill = "\##{color}"
          pen.rectangle(left, top, left + obj[:width], top + obj[:height])
          pen.draw(canvas)
        end
      end

      canvas
    end

    # Write an image produced by +render+ to +filename+.
    def self.write(render, filename)
      render.write(filename)
    end

  end
end
@@ -0,0 +1,113 @@
1
+ require 'nokogiri'
2
+
3
+ require_relative 'abstract_view'
4
+ require_relative '../language'
5
+
6
module PdfExtract
  # View that serialises the renderable objects to XML using a Nokogiri
  # builder. Objects carrying a :page are grouped under <page> elements;
  # the rest are written directly under the root <pdf> element.
  class XmlView < AbstractView

    @@ignored_attributes = [:content, :page, :page_width, :page_height]

    @@numeric_attributes = [:x, :y, :width, :height, :line_height,
                            :page_height, :page_width, :x_offset, :y_offset,
                            :spacing, :letter_ratio, :cap_ratio, :year_ratio]

    # Return renderable attributes: everything in +obj+ that is neither an
    # ignored attribute nor a nested Hash/Array. Known numeric attributes
    # and any "*_score" attribute are rounded to @render_options[:round].
    def get_xml_attributes(obj)
      attribs = obj.reject do |key, value|
        @@ignored_attributes.include?(key) ||
          value.kind_of?(Hash) || value.kind_of?(Array)
      end

      attribs.keys.each do |key|
        next unless @@numeric_attributes.include?(key) || key.to_s =~ /.+_score/
        attribs[key] = attribs[key].round(@render_options[:round])
      end

      attribs
    end

    # Return the Hash/Array valued entries of +obj+. The :lines entry is
    # left out unless @render_options[:lines] is set.
    def get_nested_objs(obj)
      nested = obj.select { |_, value| value.kind_of?(Hash) || value.kind_of?(Array) }
      return nested if @render_options[:lines]

      nested.reject { |key, _| key == :lines }
    end

    # Render all objects and return the XML as a string.
    #
    # Options: :lines (default true), :round (default 2) and
    # :outline (default false).
    def render(options = {})
      @render_options = {:lines => true, :round => 2, :outline => false}.merge(options)

      pages = {}
      page_params = {}
      pageless = {}

      objects.each_pair do |type, objs|
        objs.each do |obj|
          unless obj.key?(:page)
            (pageless[type] ||= []) << obj
            next
          end

          number = obj[:page]
          ((pages[number] ||= {})[type] ||= []) << obj

          # The first object seen on a page supplies the <page> attributes.
          page_params[number] ||= {
            :width => obj[:page_width],
            :height => obj[:page_height],
            :number => number
          }
        end
      end

      builder = Nokogiri::XML::Builder.new do |xml|
        xml.pdf do
          pageless.each_pair do |type, objs|
            objs.each { |obj| write_obj_to_xml(obj, type, xml) }
          end

          pages.each_pair do |page_number, obj_types|
            xml.page(page_params[page_number]) do
              obj_types.each_pair do |type, objs|
                objs.each { |obj| write_obj_to_xml(obj, type, xml) }
              end
            end
          end
        end
      end

      builder.to_xml
    end

    # Write +obj+ as an element named after the singular form of +type+,
    # followed recursively by its nested objects. Unless :outline is set,
    # the element gets text: the object's full text content when :lines is
    # off, otherwise its :content (if it has that key).
    def write_obj_to_xml(obj, type, xml)
      xml.send(singular_name(type.to_s), get_xml_attributes(obj)) do

        if !@render_options[:outline]
          if !@render_options[:lines]
            xml.text(Language.transliterate(Spatial.get_text_content(obj)))
          elsif obj.key?(:content)
            xml.text(Language.transliterate(obj[:content].to_s))
          end
        end

        get_nested_objs(obj).each do |name, nested_obj|
          element_name = singular_name(name.to_s)
          # A nested Hash is a single child; a nested Array is a list of them.
          children = nested_obj.kind_of?(Hash) ? [nested_obj] : nested_obj
          children.each do |child|
            write_obj_to_xml(child, element_name, xml)
          end
        end
      end
    end

    # Write a string produced by +render+ to +filename+.
    def self.write(render, filename)
      File.open(filename, "w") { |file| file.write(render) }
    end

  end
end
metadata ADDED
@@ -0,0 +1,208 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pdf-extract
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Karl Jonathan Ward
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-10-21 00:00:00 +01:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: pdf-reader
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - "="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 1
30
+ - 0
31
+ - 0
32
+ - beta1
33
+ version: 1.0.0.beta1
34
+ type: :runtime
35
+ version_requirements: *id001
36
+ - !ruby/object:Gem::Dependency
37
+ name: nokogiri
38
+ prerelease: false
39
+ requirement: &id002 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ segments:
45
+ - 1
46
+ - 5
47
+ - 0
48
+ version: 1.5.0
49
+ type: :runtime
50
+ version_requirements: *id002
51
+ - !ruby/object:Gem::Dependency
52
+ name: rmagick
53
+ prerelease: false
54
+ requirement: &id003 !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ segments:
60
+ - 2
61
+ - 13
62
+ - 1
63
+ version: 2.13.1
64
+ type: :runtime
65
+ version_requirements: *id003
66
+ - !ruby/object:Gem::Dependency
67
+ name: prawn
68
+ prerelease: false
69
+ requirement: &id004 !ruby/object:Gem::Requirement
70
+ none: false
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ segments:
75
+ - 0
76
+ - 11
77
+ - 1
78
+ version: 0.11.1
79
+ type: :runtime
80
+ version_requirements: *id004
81
+ - !ruby/object:Gem::Dependency
82
+ name: sqlite3
83
+ prerelease: false
84
+ requirement: &id005 !ruby/object:Gem::Requirement
85
+ none: false
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ segments:
90
+ - 1
91
+ - 3
92
+ - 4
93
+ version: 1.3.4
94
+ type: :runtime
95
+ version_requirements: *id005
96
+ - !ruby/object:Gem::Dependency
97
+ name: commander
98
+ prerelease: false
99
+ requirement: &id006 !ruby/object:Gem::Requirement
100
+ none: false
101
+ requirements:
102
+ - - ">="
103
+ - !ruby/object:Gem::Version
104
+ segments:
105
+ - 4
106
+ - 0
107
+ - 4
108
+ version: 4.0.4
109
+ type: :runtime
110
+ version_requirements: *id006
111
+ - !ruby/object:Gem::Dependency
112
+ name: json
113
+ prerelease: false
114
+ requirement: &id007 !ruby/object:Gem::Requirement
115
+ none: false
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ segments:
120
+ - 1
121
+ - 5
122
+ - 1
123
+ version: 1.5.1
124
+ type: :runtime
125
+ version_requirements: *id007
126
+ description:
127
+ email:
128
+ - kward@crossref.org
129
+ executables:
130
+ - pdf-extract
131
+ extensions: []
132
+
133
+ extra_rdoc_files: []
134
+
135
+ files:
136
+ - bin/assign.rb
137
+ - bin/config.json
138
+ - bin/fac_v19n11_s5.mask.pdf
139
+ - bin/margins.mask.pdf
140
+ - bin/one-column.mask.pdf
141
+ - bin/pdf-extract
142
+ - bin/s002040050107_Arch_Toxicol_1994_68_8.mask.pdf
143
+ - bin/some3.mask.pdf
144
+ - bin/some5.mask.pdf
145
+ - bin/some6.mask.pdf
146
+ - bin/train.rb
147
+ - bin/two-column.mask.pdf
148
+ - lib/analysis/columns.rb
149
+ - lib/analysis/margins.rb
150
+ - lib/analysis/sections.rb
151
+ - lib/analysis/titles.rb
152
+ - lib/analysis/zones.rb
153
+ - lib/font_metrics.rb
154
+ - lib/kmeans.rb
155
+ - lib/language.rb
156
+ - lib/model/characters.rb
157
+ - lib/model/chunks.rb
158
+ - lib/model/regions.rb
159
+ - lib/multi_range.rb
160
+ - lib/names.rb
161
+ - lib/pdf-extract.rb
162
+ - lib/pdf.rb
163
+ - lib/references/references.rb
164
+ - lib/references/resolve.rb
165
+ - lib/references/resolved_references.rb
166
+ - lib/spatial.rb
167
+ - lib/view/abstract_view.rb
168
+ - lib/view/pdf_view.rb
169
+ - lib/view/png_view.rb
170
+ - lib/view/xml_view.rb
171
+ - data/familynames.db
172
+ - data/stopwords.txt
173
+ has_rdoc: true
174
+ homepage: http://github.com/CrossRef/pdfextract
175
+ licenses: []
176
+
177
+ post_install_message:
178
+ rdoc_options: []
179
+
180
+ require_paths:
181
+ - lib
182
+ required_ruby_version: !ruby/object:Gem::Requirement
183
+ none: false
184
+ requirements:
185
+ - - ">="
186
+ - !ruby/object:Gem::Version
187
+ segments:
188
+ - 1
189
+ - 9
190
+ - 1
191
+ version: 1.9.1
192
+ required_rubygems_version: !ruby/object:Gem::Requirement
193
+ none: false
194
+ requirements:
195
+ - - ">="
196
+ - !ruby/object:Gem::Version
197
+ segments:
198
+ - 0
199
+ version: "0"
200
+ requirements: []
201
+
202
+ rubyforge_project:
203
+ rubygems_version: 1.3.7
204
+ signing_key:
205
+ specification_version: 3
206
+ summary: PDF content extraction tool and library.
207
+ test_files: []
208
+