pdf-extract 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,77 @@
1
+ require_relative 'pdf'
2
+ require_relative 'model/characters'
3
+ require_relative 'model/chunks'
4
+ require_relative 'model/regions'
5
+ require_relative 'analysis/titles'
6
+ require_relative 'analysis/margins'
7
+ require_relative 'analysis/zones'
8
+ require_relative 'analysis/columns'
9
+ require_relative 'analysis/sections'
10
+ require_relative 'references/references'
11
+ require_relative 'references/resolved_references'
12
+ require_relative 'view/png_view'
13
+ require_relative 'view/pdf_view'
14
+ require_relative 'view/xml_view'
15
+
16
+ module PdfExtract
17
+
18
+ @views = {}
19
+
20
+ @parsers = []
21
+
22
# Registers a view class under a short name so it can later be looked up
# with view_class (and selected through the :as option of view).
#
# name       - key used to look the view up.
# view_class - class to associate with that key.
#
# Returns view_class.
def self.add_view(name, view_class)
  @views.store(name, view_class)
end
25
+
26
# Appends a parser to the list that parse applies to every new Pdf.
# Parsers are applied in registration order.
#
# parser_class - object responding to include_in(pdf).
#
# Returns the list of registered parsers.
def self.add_parser(parser_class)
  @parsers.push(parser_class)
end
29
+
30
# Builds a Pdf, lets every registered parser declare its spatial types on
# it, hands it to the caller's block, and then runs each spatial call the
# Pdf has recorded.
#
# filename - path passed on to Receiver#invoke_calls for each spatial call.
# block    - required; receives the freshly prepared Pdf before any spatial
#            call is run.
#
# Returns the Pdf.
def self.parse(filename, &block)
  pdf = Pdf.new
  @parsers.each { |parser| parser.include_in(pdf) }

  yield pdf

  # Each recorded call gets its own Receiver: the builder registers its
  # listeners on it, then the receiver runs them.
  pdf.spatial_calls.each do |call|
    type_name = call[:name]
    receiver = Receiver.new(pdf)
    pdf.spatial_builders[type_name].call(receiver)
    receiver.invoke_calls(filename, pdf.spatial_options[type_name])
  end

  pdf
end
48
+
49
# Looks up the view class registered under short_name by add_view.
#
# Returns the class, or nil when nothing is registered under that name.
def self.view_class(short_name)
  @views.fetch(short_name, nil)
end
52
+
53
# Parses filename and renders the result with the view named by
# options[:as].
#
# filename - path of the PDF; passed to parse and to the view constructor.
# options  - Hash; :as selects the registered view, and the whole hash is
#            forwarded to the view's render method.
# block    - forwarded to parse, which yields the Pdf to it.
#
# Returns whatever the view's render method returns.
# Raises RuntimeError when no view is registered under options[:as].
def self.view filename, options = {}, &block
  # Resolve the view before parsing so an unknown name fails immediately
  # with a clear message instead of a NoMethodError on nil after the
  # (potentially expensive) parse has already run.
  renderer = view_class options[:as]
  raise "No such view \"#{options[:as]}\"" if renderer.nil?

  pdf = parse filename, &block
  renderer.new(pdf, filename).render options
end
57
+
58
# Registers the built-in parsers and views.
#
# Parsers are registered in the order listed, which is the order parse
# applies them to a new Pdf.
#
# Returns the last registered view class.
def self.init
  built_in_parsers = [
    Characters, Chunks, Regions, Titles, Margins,
    Zones, Columns, Sections, References, ResolvedReferences
  ]
  built_in_parsers.each { |parser| add_parser parser }

  built_in_views = [[:pdf, PdfView], [:png, PngView], [:xml, XmlView]]
  built_in_views.map { |short_name, klass| add_view short_name, klass }.last
end
74
+
75
+ init
76
+
77
+ end
data/lib/pdf.rb ADDED
@@ -0,0 +1,255 @@
1
+ require 'pdf-reader'
2
+
3
+ module PdfExtract
4
+
5
# Per-document settings with class-wide declared defaults.
#
# A setting has to be declared with Settings.default before it can be read
# or overridden. Overrides are stored per instance, converted with to_f,
# together with the "agent" that set them.
class Settings

  # Declared defaults, shared by every Settings instance.
  @@defaults = {}

  # Declares a setting and its default value.
  def self.default(key, default_value)
    @@defaults[key] = default_value
  end

  def initialize
    @settings = {}
    @agents = {}
  end

  # Returns the overridden value for key if there is one, otherwise the
  # declared default. Raises RuntimeError for an undeclared setting.
  def [](key)
    return @settings[key] if @settings[key]
    return @@defaults[key] if @@defaults[key]
    raise("Attempt to use undeclared setting \"#{key}\"")
  end

  # Overrides a declared setting for this instance.
  #
  # value - stored as value.to_f.
  # agent - recorded alongside the value; see #agent.
  #
  # Returns agent. Raises RuntimeError if key has no declared default.
  def set(key, value, agent="")
    unless @@defaults[key]
      raise "Attempt to set an undefined setting \"#{key}\""
    end

    @settings[key] = value.to_f
    @agents[key] = agent
  end

  # Declared defaults that have not been overridden on this instance.
  def unmodified
    @@defaults.reject { |name, _| @settings[name] }
  end

  # Values overridden on this instance.
  def modified
    @settings
  end

  # The agent recorded by #set for key, or nil if key was never set.
  def agent(key)
    @agents[key]
  end

end
45
+
46
# Collects the listeners a spatial-type builder registers and runs them.
#
# A builder is handed a Receiver and registers any of:
#   * `for`     - a handler for a named callback; when at least one is
#                 registered the Receiver itself is passed to
#                 PDF::Reader.file as the callback receiver,
#   * `objects` - a listener invoked with each existing spatial object of
#                 another type,
#   * `before` / `after` - hooks run around the object listeners.
# Values returned by `for` handlers and by the `after` hook are stored in
# the Pdf's spatial_objects.
class Receiver

  def initialize pdf
    @pdf = pdf
    @listeners = {}
    @object_listeners = {}
  end

  # Registers block as the handler for callback_name. The spatial type
  # being built at registration time (pdf.operating_type) is remembered as
  # the default type for objects the handler returns.
  def for callback_name, &block
    @listeners[callback_name] = {:type => @pdf.operating_type, :fn => block}
  end

  # Registers block to be called with each spatial object of type_name.
  # Several listeners may be registered for the same type.
  def objects type_name, &block
    @object_listeners[type_name] ||= []
    @object_listeners[type_name] << block
  end

  # Registers a hook run before the object listeners.
  def before &block
    @before = block
  end

  # Registers a hook run after the object listeners; its return value is
  # stored as spatial objects of the type being built at registration time.
  def after &block
    @after = {:type => @pdf.operating_type, :fn => block}
  end

  # Turns every `for` listener into a method on this receiver named after
  # the callback. The handler is called with the callback arguments packed
  # into a single array, and whatever it returns is stored as spatial
  # objects.
  def expand_listeners_to_callback_methods
    # TODO merge on callback_name
    @listeners.each_pair do |callback_name, callback_handler|
      # Define the callback on this instance only. Defining it on the class
      # would leave one spatial type's callbacks visible on every Receiver
      # created afterwards.
      define_singleton_method callback_name do |*args|
        spatial_objects = callback_handler[:fn].call args
        add_spatial_objects callback_handler[:type], spatial_objects
      end
    end
  end

  # Calls every object listener with each object of its type.
  #
  # spatial_objects - Hash of type => Array of objects.
  #
  # Raises RuntimeError when a listener's type has no entry in
  # spatial_objects.
  def call_object_listeners spatial_objects
    @object_listeners.each_pair do |type, listeners|
      objs = spatial_objects[type]
      if objs.nil?
        raise "#{@pdf.operating_type} is missing a dependency on #{type}"
      end

      listeners.each do |listener|
        objs.each { |obj| listener.call obj }
      end
    end
  end

  # Runs the `after` hook, if any, and stores what it returns.
  def call_after
    add_spatial_objects @after[:type], @after[:fn].call unless @after.nil?
  end

  # Runs the `before` hook, if any.
  def call_before
    @before.call unless @before.nil?
  end

  # True when at least one `for` listener is registered.
  def for_calls?
    @listeners.size > 0
  end

  # True when at least one `objects` listener is registered.
  def object_calls?
    @object_listeners.size > 0
  end

  # Stores objs in the Pdf's spatial_objects.
  #
  # default_type - type used for objects without a :group key.
  # objs         - nil (nothing stored), a single object, or an Array of
  #                objects. An object's :group key, when present, is
  #                removed from the object and used as its type.
  def add_spatial_objects default_type, objs
    if objs.class != Array
      objs = objs.nil? ? [] : [objs]
    end

    objs.each do |obj|
      type = obj.delete(:group) || default_type
      @pdf.spatial_objects[type] ||= []
      @pdf.spatial_objects[type] << obj
    end
  end

  # Runs the registered hooks and listeners for one spatial call.
  #
  # filename        - PDF path handed to PDF::Reader when `for` listeners
  #                   exist.
  # spatial_options - options of the spatial type; with :paged set, the
  #                   before hook / object listeners / after hook sequence
  #                   runs once per page, each time with only that page's
  #                   objects.
  def invoke_calls filename, spatial_options
    if spatial_options[:paged]

      # Regroup the listened-to types as page => {type => objects}.
      paged_objs = {}
      @object_listeners.each_pair do |type, _|
        @pdf.paged_objects(type).each_pair do |page, objs|
          paged_objs[page] ||= {}
          paged_objs[page][type] = objs
        end
      end

      paged_objs.each_pair do |_page, objs|
        call_before
        call_object_listeners objs if object_calls?
        call_after
      end

    else

      call_before
      call_object_listeners @pdf.spatial_objects if object_calls?
      call_after

    end

    if for_calls?
      expand_listeners_to_callback_methods
      PDF::Reader.file filename, self, :raw_text => true
    end
  end

end
165
+
166
# Registry of spatial types for one document: how each type is built, the
# options it was declared with, which types have been requested, and the
# objects produced so far.
class Pdf

  attr_accessor :operating_type, :spatial_calls, :spatial_builders, :spatial_objects
  attr_accessor :spatial_options, :settings

  def initialize
    @spatial_builders = {}
    @spatial_calls = []
    @spatial_objects = {}
    @spatial_options = {}
    @settings = Settings.new
  end

  # Any method that is not defined is treated as a request for an unknown
  # spatial type.
  def method_missing(name, *args)
    raise "No such spatial type #{name}"
  end

  # Declares a spatial type called name.
  #
  # options - Hash; :depends_on lists the types that must be requested
  #           before this one.
  # block   - builder, later called with a Receiver.
  def spatials(name, options = {}, &block)
    add_spatials_method(name, options, &block)
  end

  # True when name was requested directly rather than only pulled in as a
  # dependency.
  def explicit_call?(name)
    @spatial_calls.any? { |call| call[:name] == name && call[:explicit] }
  end

  # Objects of the given type grouped by their :page value.
  # Returns an empty Hash when there is no entry for the type.
  def paged_objects(type)
    objs = @spatial_objects[type]
    return {} unless objs

    objs.group_by { |obj| obj[:page] }
  end

  # Objects collected for type, or nil if there is no entry for it.
  def [](type)
    @spatial_objects[type]
  end

  # Overrides a setting; see Settings#set.
  def set(setting, value, agent="")
    @settings.set(setting, value, agent)
  end

  private

  # Records a non-explicit call for every type in deps_list that has not
  # been recorded yet, visiting each type's own dependencies first.
  def append_deps(deps_list)
    # TODO if explicit is true, overwrite non-explicit deps.
    deps_list.each do |dep|
      append_deps @spatial_options[dep].fetch(:depends_on, [])
      next if @spatial_calls.any? { |call| call[:name] == dep }

      @spatial_calls << {:name => dep, :explicit => false}
    end
  end

  # Stores the builder and options for name and defines a method called
  # name which records the type (and, first, its dependencies) in
  # spatial_calls.
  #
  # NOTE(review): the method is defined on the class, so it is shared by
  # every Pdf instance rather than just this one - confirm this is intended.
  def add_spatials_method(name, options = {}, &block)
    options = {:depends_on => [], :defined_by => []}.merge(options)

    @spatial_objects[name] = []
    @spatial_options[name] = options
    @spatial_builders[name] = proc do |receiver|
      @operating_type = name
      block.call(receiver) unless block.nil?
    end

    self.class.send(:define_method, name) do
      append_deps options[:depends_on]
      @spatial_calls << {:name => name, :explicit => true}

      # NOTE(review): `yield` here binds to the block given to
      # add_spatials_method (the builder), not to a block passed to the
      # generated method - confirm which was intended.
      @spatial_objects[name].each { |o| yield o }
    end
  end
end
253
+
254
+ end
255
+
@@ -0,0 +1,184 @@
1
+ require_relative "../spatial"
2
+
3
+ module PdfExtract
4
+ module References
5
+
6
# Defaults for the settings read by the reference extractor:
#   :min_score           - lowest :reference_score a section may have and
#                          still be examined for references.
#   :min_sequence_count  - number of consecutive increments a run of
#                          numbers must reach to count as a numbered list.
#   :max_reference_order - numbers at or above this are never treated as
#                          reference numbers (avoids mistaking years).
{
  :min_score => 6.4,
  :min_sequence_count => 3,
  :max_reference_order => 1000
}.each_pair { |name, value| Settings.default name, value }
9
+
10
# Splits ary into consecutive groups, starting a new group at every item
# for which the block returns a truthy value.
#
# ary   - items to split, in order.
# block - called with each item; truthy means "this item begins a group".
#
# Returns an Array of non-empty Arrays which together contain every item
# of ary in its original order.
def self.partition_by ary, &block
  parts = []
  current = []

  ary.each do |item|
    if yield(item)
      parts << current
      current = []
    end
    current << item
  end

  # The group still being accumulated when the input runs out has to be
  # kept too; without this the final group was silently dropped.
  parts << current

  parts.reject { |part| part.empty? }
end
22
+
23
# Counts how often each floored value of delimit_key occurs in lines.
#
# lines       - Array of Hashes with a numeric value under delimit_key.
# delimit_key - key whose value is floored and counted.
#
# Returns an Array of {:value => floored value, :count => occurrences},
# most frequent first.
def self.frequencies(lines, delimit_key)
  counts = Hash.new(0)
  lines.each { |line| counts[line[delimit_key].floor] += 1 }

  tallies = counts.map { |value, count| {:value => value, :count => count} }
  tallies.sort_by { |item| item[:count] }.reverse
end
38
+
39
# Picks the value used to detect where a new reference starts: the
# second most frequent floored value of delimit_key among lines.
#
# Expects at least two distinct floored values; callers check this with
# multi_margin? / multi_spacing? first.
def self.select_delimiter(lines, delimit_key)
  ranked = frequencies(lines, delimit_key)
  runner_up = ranked[1]
  runner_up[:value]
end
42
+
43
# Splits a section's lines into references using the left margin: a line
# whose floored :x_offset equals the delimiting offset starts a reference.
# Lines before the first such line are discarded.
#
# lines - Array of Hashes with :x_offset and :content.
#
# Returns an Array of {:content => String}, each joining one group's line
# contents with single spaces.
def self.split_by_margin(lines)
  margin = select_delimiter(lines, :x_offset)
  starts_reference = lambda { |line| line[:x_offset].floor == margin }

  from_first = lines.drop_while { |line| !starts_reference.call(line) }
  groups = partition_by(from_first) { |line| starts_reference.call(line) }

  groups.map do |group|
    {:content => group.map { |line| line[:content] }.join(" ")}
  end
end
49
+
50
# Splits a section's lines into references using line spacing: a line
# whose floored :spacing equals the delimiting spacing starts a reference.
# Lines before the first such line are discarded.
#
# lines - Array of Hashes with :spacing and :content.
#
# Returns an Array of {:content => String}, each joining one group's line
# contents with single spaces.
def self.split_by_line_spacing(lines)
  gap = select_delimiter(lines, :spacing)
  starts_reference = lambda { |line| line[:spacing].floor == gap }

  from_first = lines.drop_while { |line| !starts_reference.call(line) }
  groups = partition_by(from_first) { |line| starts_reference.call(line) }

  groups.map do |group|
    {:content => group.map { |line| line[:content] }.join(" ")}
  end
end
56
+
57
# Splits the text of a numbered reference list into individual references,
# using sequential numbers as partition points.
#
# pdf - object whose settings[:max_reference_order] bounds the numbers
#       that may be treated as reference numbers.
# s   - text of the section.
#
# Returns an Array of {:content => String, :order => Integer}. Returns an
# empty Array when the text has no usable numeric delimiters, or when the
# characters around the numbers are not among the recognised ones.
def self.split_by_delimiter pdf, s
  max_order = pdf.settings[:max_reference_order]

  # Determine the characters that are most likely part of numeric
  # delimiters: count the characters directly before and after each number
  # that starts or continues an ascending run.
  before = Hash.new(0)
  after = Hash.new(0)
  last_n = -1

  s.scan(/[^\d]?\d+[^\d]/) do |m|
    n = m[/\d+/].to_i
    next unless n < max_order
    next unless last_n == -1 || n == last_n.next

    before[m[0]] += 1
    after[m[-1]] += 1
    last_n = n
  end

  # No candidate numbers at all: there is nothing to split on. Without
  # this guard the pattern below would be "?\d+", which is not a valid
  # regular expression.
  return [] if before.empty?

  b_s = "\\" + before.max_by { |_, v| v }[0]
  a_s = "\\" + after.max_by { |_, v| v }[0]

  # TODO Turn into settings. Needs typed settings
  return [] unless ["\\[", "\\ "].include?(b_s) && ["\\.", "\\]", "\\ "].include?(a_s)

  # Split by the delimiters and record separate refs.
  delimiter = Regexp.new "#{b_s}?\\d+#{a_s}"
  last_n = -1
  current_ref = ""
  refs = []
  parts = s.partition(delimiter)

  until parts[1].empty?
    n = parts[1][/\d+/].to_i
    if n < max_order && last_n == -1
      # First delimiter: the text in front of it is not part of any ref.
      last_n = n
    elsif n == last_n.next
      # Next number in the run: everything gathered so far is one ref.
      current_ref += parts[0]
      refs << {
        :content => current_ref.strip,
        :order => last_n
      }
      current_ref = ""
      last_n = n
    else
      # A number that does not continue the run is ordinary text.
      current_ref += parts[0] + parts[1]
    end

    parts = parts[2].partition(delimiter)
  end

  refs << {
    :content => (current_ref + parts[0]).strip,
    :order => last_n
  }

  refs
end
131
+
132
# True when the lines start at more than one distinct floored :x_offset.
def self.multi_margin?(lines)
  lines.map { |line| line[:x_offset].floor }.uniq.size > 1
end
135
+
136
# True when the lines have more than one distinct floored :spacing.
def self.multi_spacing?(lines)
  lines.map { |line| line[:spacing].floor }.uniq.size > 1
end
139
+
140
# Decides whether content looks like a numbered list: it must contain an
# ascending run of numbers (each exactly one more than the previous
# accepted number) with at least settings[:min_sequence_count] increments.
#
# pdf     - object providing settings[:max_reference_order] and
#           settings[:min_sequence_count].
# content - text to inspect.
def self.numeric_sequence?(pdf, content)
  previous = -1
  increments = 0

  content.scan(/\d+/) do |digits|
    number = digits.to_i
    # Avoid misinterpreting years as sequence
    next unless number < pdf.settings[:max_reference_order]

    if previous == -1
      previous = number
    elsif number == previous.next
      previous = number
      increments += 1
    end
  end

  increments >= pdf.settings[:min_sequence_count]
end
157
+
158
# Declares the :references spatial type on pdf. It depends on :sections.
#
# Every section whose :reference_score reaches settings[:min_score] is
# split into references with the first strategy that applies: numeric
# delimiters, then left margin, then line spacing. The collected
# references are returned from the `after` hook.
def self.include_in(pdf)
  pdf.spatials :references, :depends_on => [:sections] do |parser|

    collected = []

    parser.objects :sections do |section|
      # TODO Take top x%, fix Infinity coming back from score.
      next unless section[:reference_score] >= pdf.settings[:min_score]

      if numeric_sequence? pdf, Spatial.get_text_content(section)
        collected.concat split_by_delimiter(pdf, Spatial.get_text_content(section))
      elsif multi_margin? section[:lines]
        collected.concat split_by_margin(section[:lines])
      elsif multi_spacing? section[:lines]
        collected.concat split_by_line_spacing(section[:lines])
      end
    end

    parser.after { collected }

  end
end
182
+
183
+ end
184
+ end
@@ -0,0 +1,113 @@
1
+ require 'cgi'
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+ require 'net/http'
5
+
6
module PdfExtract
  # Looks up extracted reference text in external services. A resolver is
  # any object with a `find(text)` method returning a Hash of the fields it
  # could determine; Resolve.find merges the results of every configured
  # resolver into the reference.
  module Resolve

    # Resolves a reference to a DOI through the CrossRef labs search page.
    class Sigg

      # ref - reference text to search for.
      #
      # Returns {:doi => String} when the first result has a score of at
      # least 90, otherwise {}. Lookup is best effort: any error while
      # fetching or reading the page also yields {}.
      def self.find ref
        url = "http://api.labs.crossref.org/search?q=#{CGI.escape(ref)}"
        resolved = {}
        begin
          # URI#open (from open-uri) with a block closes the connection once
          # the page is parsed, and does not rely on Kernel#open accepting
          # URLs.
          doc = URI.parse(url).open { |io| Nokogiri::HTML(io) }

          result = doc.at_css "div.result"
          unless result.nil?
            score = result.at_css("span.cr_score").content.to_s
            if score.to_i >= 90
              doi = result.at_css "span.doi"
              resolved[:doi] = doi.content.sub "http://dx.doi.org/", ""
            end
          end
        rescue
          # Deliberately best effort: an unreachable service or an
          # unexpected page leaves the reference unresolved.
        end
        resolved
      end

    end

    # Parses a citation into fields using the FreeCite service.
    class FreeCite

      FIELDS = [:title, :journal, :pages, :year]

      # ref - citation text.
      #
      # Returns a Hash with those of :title, :journal, :pages and :year that
      # are present in the service's XML response.
      def self.find ref
        Net::HTTP.start "freecite.library.brown.edu" do |http|
          # The citation is form data, so it has to be URL-encoded; raw
          # text containing "&", "=" or "%" would corrupt the request body.
          r = http.post "/citations/create", "citation=#{CGI.escape(ref)}",
                        "Accept" => "text/xml"
          doc = Nokogiri::XML r.body

          fields = {}
          FIELDS.each do |field|
            node = doc.at_xpath("//#{field}")
            # Skip fields the response does not contain instead of failing
            # on a nil node.
            fields[field] = node.content unless node.nil?
          end
          fields
        end
      end

    end

    # Resolves a reference to a DOI by submitting CrossRef's Simple Text
    # Query form.
    class SimpleTextQuery

      # Session cookie, fetched once and reused for later queries.
      @@cookie = nil

      # ref - reference text.
      #
      # Returns {:doi => String} when the result page contains a DOI link,
      # otherwise {}.
      def self.find ref
        create_session

        post = Net::HTTP::Post.new "/SimpleTextQuery"
        post.add_field "Cookie", @@cookie
        post.add_field "Referer", "http://www.crossref.org/SimpleTextQuery"
        post.set_form_data({
          "command" => "Submit",
          "freetext" => ref,
          #"emailField" => "kward@crossref.org",
          "doiField" => "",
          #"username" => "",
          #"password" => ""
        })
        response = Net::HTTP.start "www.crossref.org" do |http|
          http.request post
        end

        doc = Nokogiri::HTML response.body
        doi = doc.at_css "td.resultB > a"

        if doi.nil?
          {}
        else
          {:doi => doi.content.sub("doi:", "")}
        end
      end

      # Fetches the form page once to obtain the session cookie.
      def self.create_session
        if @@cookie.nil?
          Net::HTTP.start "www.crossref.org" do |http|
            response = http.get "/SimpleTextQuery"
            @@cookie = response["Set-Cookie"]
          end
        end
      end

    end

    # Resolvers consulted by Resolve.find, in order.
    @@resolvers = [Sigg]

    # Replaces the whole list of resolvers.
    def self.resolvers= resolver
      @@resolvers = resolver
    end

    # Adds a resolver unless it is already in the list.
    def self.add_resolver resolver
      @@resolvers << resolver unless @@resolvers.include? resolver
    end

    # ref - Hash describing a reference; its :content is the text handed to
    #       each resolver.
    #
    # Returns a copy of ref with every resolver's result merged in; later
    # resolvers overwrite keys set by earlier ones. ref itself is not
    # modified.
    def self.find ref
      ref = ref.dup
      @@resolvers.each do |resolver|
        ref.merge! resolver.find(ref[:content])
      end
      ref
    end

  end
end
@@ -0,0 +1,37 @@
1
+ require_relative 'resolve'
2
+
3
+ module PdfExtract
4
+ module ResolvedReferences
5
+
6
# Declares the :resolved_references spatial type on pdf. It depends on
# :references.
#
# Each reference is merged with whatever Resolve.find returns for it, and
# the merged references are returned from the `after` hook.
def self.include_in(pdf)
  pdf.spatials(:resolved_references, :depends_on => [:references]) do |parser|
    resolved = []

    parser.objects(:references) do |ref|
      resolved.push(ref.merge(Resolve.find(ref)))
    end

    parser.after { resolved }
  end
end
21
+
22
# Looks a reference up on the CrossRef labs search page.
#
# ref - reference text to search for.
#
# Returns the DOI of the first result (without the "http://dx.doi.org/"
# prefix) when that result has a score of at least 90. Returns "" when the
# score is lower, or when the page has no result, score or DOI element.
def self.reverse_resolve ref
  url = "http://api.labs.crossref.org/search?q=#{CGI.escape(ref)}"
  # URI#open (from open-uri) with a block closes the connection once the
  # page is parsed, and does not rely on Kernel#open accepting URLs.
  doc = URI.parse(url).open { |io| Nokogiri::HTML(io) }

  # A search without hits has no result element; report "no DOI" rather
  # than failing on nil.
  result = doc.at_css "div.result"
  return "" if result.nil?

  score = result.at_css "span.cr_score"
  doi = result.at_css "span.doi"
  return "" if score.nil? || doi.nil?

  if score.content.to_s.to_i >= 90
    doi.content.sub("http://dx.doi.org/", "")
  else
    ""
  end
end
35
+
36
+ end
37
+ end