pdf-extract 0.0.10 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -29,7 +29,7 @@ module PdfExtract
29
29
  @ascent = 0
30
30
  @descent = 0
31
31
  @bbox = [0, 0, 0, 0]
32
-
32
+
33
33
  base_font = font.basefont.to_s
34
34
  if @@base_fonts.key? base_font
35
35
  @ascent = @@base_fonts[base_font][:Ascent]
@@ -37,12 +37,20 @@ module PdfExtract
37
37
  @bbox = @@base_fonts[base_font][:FontBBox]
38
38
  @glyph_width_lookup = proc { |c|
39
39
  @@base_fonts[base_font][:Widths].fetch(c.codepoints.first, 0)
40
- }
40
+ }
41
41
  else
42
42
  @ascent = font.ascent
43
43
  @descent = font.descent
44
44
  @bbox = font.bbox
45
- @glyph_width_lookup = proc { |c| font.glyph_width c }
45
+ @glyph_width_lookup = proc do |c|
46
+ begin
47
+ font.glyph_width c.codepoints.first
48
+ rescue TypeError => e
49
+ # It seems some fonts don't have a first char attribute in their
50
+ # descriptor and this causes problems for pdf-reader.
51
+ 0
52
+ end
53
+ end
46
54
  end
47
55
 
48
56
  if not @bbox.nil?
@@ -17,7 +17,7 @@ module PdfExtract::Language
17
17
  when "\ufb05" then r << "ft"
18
18
  when "\ufb06" then r << "st"
19
19
  when "\u1d6b" then r << "ue"
20
-
20
+
21
21
  # Normalise some punctuation.
22
22
  when "\u2018" then r << "'"
23
23
  when "\u2019" then r << "'"
@@ -33,19 +33,19 @@ module PdfExtract::Language
33
33
  r << c
34
34
  end
35
35
  end
36
-
36
+
37
37
  r.gsub /\s+/, " "
38
38
  end
39
-
39
+
40
40
  def self.letter_ratio s
41
41
  s.count("A-Z0-9\-[],.\"'()") / s.length.to_f
42
42
  end
43
-
43
+
44
44
  # TODO Ignore caps in middle of words
45
45
  def self.cap_ratio s
46
46
  sentence_end = true
47
47
  cap_count = 0
48
-
48
+
49
49
  s.each_char do |c|
50
50
  if c =~ /\./
51
51
  sentence_end = true
@@ -56,13 +56,13 @@ module PdfExtract::Language
56
56
  sentence_end = false
57
57
  end
58
58
  end
59
-
59
+
60
60
  cap_count / s.split.length.to_f
61
61
  end
62
-
62
+
63
63
  def self.year_ratio s
64
64
  words = s.split
65
-
65
+
66
66
  year_words = words.map do |word|
67
67
  word =~ /[^\d]\d{4}[^\d]/
68
68
  end
@@ -77,6 +77,6 @@ module PdfExtract::Language
77
77
  def self.word_count s
78
78
  s.split.count
79
79
  end
80
-
80
+
81
81
  end
82
82
 
@@ -24,14 +24,14 @@ module PdfExtract
24
24
  }
25
25
 
26
26
  def self.include_in pdf
27
-
27
+
28
28
  pdf.spatials :chunks, :paged => true, :depends_on => [:characters] do |parser|
29
29
  rows = {}
30
30
 
31
31
  parser.before do
32
32
  rows = {}
33
33
  end
34
-
34
+
35
35
  parser.objects :characters do |chars|
36
36
  y = chars[:y]
37
37
  rows[y] = [] if rows[y].nil?
@@ -48,7 +48,7 @@ module PdfExtract
48
48
  char_slop = pdf.settings[:char_slop]
49
49
  word_slop = pdf.settings[:word_slop]
50
50
  overlap_slop = pdf.settings[:overlap_slop]
51
-
51
+
52
52
  text_chunks = []
53
53
 
54
54
  rows.each_pair do |y, row|
@@ -105,7 +105,11 @@ module PdfExtract
105
105
  end
106
106
 
107
107
  merged_text_chunks << text_chunks.first
108
- end
108
+
109
+ # Remove empty lines - they mess up region detection by
110
+ # making them join together.
111
+ merged_text_chunks.reject { |chunk| chunk[:content].strip == "" }
112
+ end
109
113
  end
110
114
  end
111
115
 
@@ -41,7 +41,7 @@ to be part of the same region. :line_slop is multiplied by the average line heig
41
41
  height_taken = from_top + line[:height]
42
42
  end
43
43
  end
44
-
44
+
45
45
  def self.include_in pdf
46
46
  pdf.spatials :regions, :paged => true, :depends_on => [:chunks] do |parser|
47
47
  chunks = []
@@ -51,7 +51,7 @@ to be part of the same region. :line_slop is multiplied by the average line heig
51
51
  chunks = []
52
52
  regions = []
53
53
  end
54
-
54
+
55
55
  parser.objects :chunks do |chunk|
56
56
  y = chunk[:y].floor
57
57
 
@@ -72,16 +72,16 @@ to be part of the same region. :line_slop is multiplied by the average line heig
72
72
  chunk[:lines] = [Spatial.as_line(chunk)]
73
73
  chunk.delete :content
74
74
  end
75
-
75
+
76
76
  compare_index = 1
77
77
  while chunks.count > compare_index
78
78
  b = chunks.first
79
79
  t = chunks[compare_index]
80
-
80
+
81
81
  line_height = b[:line_height]
82
82
  line_slop = [line_height, t[:height]].min * pdf.settings[:line_slop]
83
83
  incident_y = (b[:y] + b[:height] + line_slop) >= t[:y]
84
-
84
+
85
85
  if incident_y && incident(t, b)
86
86
  chunks[0] = Spatial.merge t, b, :lines => true
87
87
  chunks.delete_at compare_index
@@ -96,7 +96,7 @@ to be part of the same region. :line_slop is multiplied by the average line heig
96
96
  compare_index = 1
97
97
  end
98
98
  end
99
-
99
+
100
100
  regions << chunks.first unless chunks.first.nil?
101
101
 
102
102
  regions.each do |region|
@@ -112,6 +112,6 @@ to be part of the same region. :line_slop is multiplied by the average line heig
112
112
  end
113
113
  end
114
114
  end
115
-
115
+
116
116
  end
117
117
  end
@@ -15,7 +15,7 @@ module PdfExtract
15
15
  r.include?(range.min) || r.include?(range.max) ||
16
16
  range.include?(r.min) || range.include?(r.max)
17
17
  end
18
-
18
+
19
19
  incident << range
20
20
 
21
21
  non_incident = @ranges - incident
@@ -46,11 +46,21 @@ module PdfExtract
46
46
  end
47
47
 
48
48
  def max
49
- @max ||= @ranges.sort_by { |r| -r.max }.first.max
49
+ @ranges.sort_by { |r| -r.max }.first.max
50
50
  end
51
51
 
52
52
  def min
53
- @min ||= @ranges.sort_by { |r| r.min }.first.min
53
+ @ranges.sort_by { |r| r.min }.first.min
54
+ end
55
+
56
+ def widest
57
+ widest = @ranges.sort_by { |r| r.max - r.min }.last
58
+ widest.max - widest.min
59
+ end
60
+
61
+ def narrowest
62
+ narrowest = @ranges.sort_by { |r| r.max - r.min }.first
63
+ narrowest.max - narrowest.min
54
64
  end
55
65
 
56
66
  def avg
@@ -9,7 +9,6 @@ require_relative 'analysis/columns'
9
9
  require_relative 'analysis/sections'
10
10
  require_relative 'references/references'
11
11
  require_relative 'references/resolved_references'
12
- require_relative 'view/png_view'
13
12
  require_relative 'view/pdf_view'
14
13
  require_relative 'view/xml_view'
15
14
 
@@ -68,7 +67,6 @@ module PdfExtract
68
67
  add_parser ResolvedReferences
69
68
 
70
69
  add_view :pdf, PdfView
71
- add_view :png, PngView
72
70
  add_view :xml, XmlView
73
71
  end
74
72
 
@@ -6,7 +6,7 @@ module PdfExtract
6
6
  module References
7
7
 
8
8
  Settings.declare :reference_flex, {
9
- :default => 0.1,
9
+ :default => 0.2,
10
10
  :module => self.name,
11
11
  :description => "Article sections are given a score as potential reference sections. Their score is based on article section features, such as the number of family names that appear, the ratio of uppercase letters to lowercase, and so on. Any article section that has a score that is more than 1 - :reference_flex percent of the best score will be parsed as a reference section."
12
12
  }
@@ -16,7 +16,7 @@ module PdfExtract
16
16
  :module => self.name,
17
17
  :description => "There must be :min_sequence_count or more numbered references within a candidate reference section for them to be parsed as number-delimited references."
18
18
  }
19
-
19
+
20
20
  Settings.declare :max_reference_order, {
21
21
  :default => 1000,
22
22
  :module => self.name,
@@ -82,11 +82,11 @@ module PdfExtract
82
82
 
83
83
  # Determine the charcaters that are most likely part of numeric
84
84
  # delimiters.
85
-
85
+
86
86
  after = {}
87
87
  before = {}
88
88
  last_n = -1
89
-
89
+
90
90
  s.scan /[^\d]?\d+[^\d]/ do |m|
91
91
  n = m[/\d+/].to_i
92
92
  if n < pdf.settings[:max_reference_order]
@@ -115,14 +115,14 @@ module PdfExtract
115
115
  if ["", "\\[", "\\ "].include?(b_s) && ["", "\\.", "\\]", "\\ "].include?(a_s)
116
116
 
117
117
  # Split by the delimiters and record separate refs.
118
-
118
+
119
119
  last_n = -1
120
120
  current_ref = ""
121
121
  refs = []
122
122
  parts = s.partition(Regexp.new "#{b_s}?\\d+#{a_s}")
123
-
123
+
124
124
  while not parts[1].length.zero?
125
- n = parts[1][/\d+/].to_i
125
+ n = parts[1][/\d+/].to_i
126
126
  if n < pdf.settings[:max_reference_order] && last_n == -1
127
127
  last_n = n
128
128
  elsif n == last_n.next
@@ -139,12 +139,12 @@ module PdfExtract
139
139
 
140
140
  parts = parts[2].partition(Regexp.new "#{b_s}?\\d+#{a_s}")
141
141
  end
142
-
142
+
143
143
  refs << {
144
144
  :content => (current_ref + parts[0]).strip,
145
145
  :order => last_n
146
146
  }
147
-
147
+
148
148
  refs
149
149
 
150
150
  else
@@ -177,7 +177,7 @@ module PdfExtract
177
177
 
178
178
  seq_count >= pdf.settings[:min_sequence_count]
179
179
  end
180
-
180
+
181
181
  def self.include_in pdf
182
182
  pdf.spatials :references, :depends_on => [:sections] do |parser|
183
183
 
@@ -190,7 +190,7 @@ module PdfExtract
190
190
  parser.after do
191
191
  max_score = sections.map {|s| s[:reference_score]}.max
192
192
  min_permittable = max_score - (max_score * pdf.settings[:reference_flex])
193
-
193
+
194
194
  refs = []
195
195
 
196
196
  sections = sections.reject do |s|
@@ -199,13 +199,14 @@ module PdfExtract
199
199
  # half of an article.
200
200
  s[:lateness] < pdf.settings[:min_lateness] || s[:year_ratio].zero?
201
201
  end
202
-
202
+
203
203
  sections.each do |section|
204
204
  if section[:reference_score] >= min_permittable
205
205
  # TODO Enable classification once we have a reasonable model.
206
206
  #if Score.reference?(section)
207
- if numeric_sequence? pdf, Spatial.get_text_content(section)
208
- refs += split_by_delimiter pdf, Spatial.get_text_content(section)
207
+ content = Spatial.get_text_content(section)
208
+ if numeric_sequence? pdf, content
209
+ refs += split_by_delimiter pdf, content
209
210
  elsif multi_margin? section[:lines]
210
211
  refs += split_by_margin section[:lines]
211
212
  elsif multi_spacing? section[:lines]
@@ -213,7 +214,7 @@ module PdfExtract
213
214
  end
214
215
  end
215
216
  end
216
-
217
+
217
218
  # TODO Ideally we wouldn't see the ref headers here.
218
219
  # Unfortunately publication details can look a lot like references.
219
220
  refs.reject do |ref|
@@ -12,12 +12,12 @@ module PdfExtract::Resolve
12
12
  resolved = {}
13
13
  begin
14
14
  doc = Nokogiri::HTML(open url)
15
-
15
+
16
16
  result = doc.at_css "div.result"
17
17
  unless result.nil?
18
18
  score = result.at_css("span.cr_score").content.to_s
19
19
  if score.to_i >= 90
20
- doi = result.at_css "span.doi"
20
+ doi = result.at_css "span.doi"
21
21
  resolved[:doi] = doi.content.sub "http://dx.doi.org/", ""
22
22
  end
23
23
  end
@@ -25,17 +25,17 @@ module PdfExtract::Resolve
25
25
  end
26
26
  resolved
27
27
  end
28
-
28
+
29
29
  end
30
-
30
+
31
31
  class FreeCite
32
-
32
+
33
33
  def self.find ref
34
34
  Net::HTTP.start "freecite.library.brown.edu" do |http|
35
35
  r = http.post "/citations/create", "citation=#{ref}",
36
36
  "Accept" => "text/xml"
37
37
  doc = Nokogiri::XML r.body
38
-
38
+
39
39
  {
40
40
  :title => doc.at_xpath("//title").content,
41
41
  :journal => doc.at_xpath("//journal").content,
@@ -44,13 +44,13 @@ module PdfExtract::Resolve
44
44
  }
45
45
  end
46
46
  end
47
-
47
+
48
48
  end
49
-
49
+
50
50
  class SimpleTextQuery
51
51
 
52
52
  @@cookie = nil
53
-
53
+
54
54
  def self.find ref
55
55
  create_session
56
56
 
@@ -68,10 +68,10 @@ module PdfExtract::Resolve
68
68
  response = Net::HTTP.start "www.crossref.org" do |http|
69
69
  http.request post
70
70
  end
71
-
71
+
72
72
  doc = Nokogiri::HTML response.body
73
73
  doi = doc.at_css "td.resultB > a"
74
-
74
+
75
75
  if doi.nil?
76
76
  {}
77
77
  else
@@ -87,11 +87,11 @@ module PdfExtract::Resolve
87
87
  end
88
88
  end
89
89
  end
90
-
90
+
91
91
  end
92
-
92
+
93
93
  @@resolvers = [Sigg]
94
-
94
+
95
95
  def self.resolvers= resolver
96
96
  @@resolvers = resolver
97
97
  end
@@ -109,5 +109,5 @@ module PdfExtract::Resolve
109
109
  end
110
110
  ref
111
111
  end
112
-
112
+
113
113
  end
@@ -8,7 +8,7 @@ module PdfExtract
8
8
  end
9
9
 
10
10
  @@reference_model = Model.new(path_to_data("reference.model"))
11
-
11
+
12
12
  def self.reference? section
13
13
  sample = {
14
14
  1 => section[:letter_ratio],
@@ -24,13 +24,13 @@ module PdfExtract
24
24
 
25
25
  def self.merge_lines a, b, so
26
26
  so[:lines] = []
27
-
27
+
28
28
  if a.key? :lines
29
29
  so[:lines] += a[:lines]
30
30
  else
31
31
  so[:lines] << as_line(a)
32
32
  end
33
-
33
+
34
34
  if b.key? :lines
35
35
  so[:lines] += b[:lines]
36
36
  else
@@ -60,7 +60,7 @@ module PdfExtract
60
60
  so[:content] = (a[:content] + options[:separator] + b[:content])
61
61
  so[:content] = so[:content].gsub /\s+/, " "
62
62
  end
63
-
63
+
64
64
  if get_text_content(a).length > get_text_content(b).length
65
65
  so[:font] = a[:font]
66
66
  so[:line_height] = a[:line_height]
@@ -115,12 +115,12 @@ module PdfExtract
115
115
  # correct write order, specified by write_mode.
116
116
  def self.collapse objs, options={}
117
117
  options = @@default_options.merge options
118
-
118
+
119
119
  sorted = case write_mode
120
120
  when :left_to_right
121
121
  objs.sort_by { |obj| -(obj[:y].floor * 100) + (obj[:x] / 100.0) }
122
122
  end
123
-
123
+
124
124
  if sorted.count == 1
125
125
  sorted.first.dup
126
126
  else
@@ -132,18 +132,18 @@ module PdfExtract
132
132
  end
133
133
  end
134
134
 
135
- def self.contains? a, b
136
- a_x1 = a[:x]
137
- a_x2 = a[:x] + a[:width]
138
- a_y1 = a[:y]
139
- a_y2 = a[:y] + a[:height]
135
+ def self.contains? a, b, padding=0
136
+ a_x1 = a[:x] - padding
137
+ a_x2 = a[:x] + a[:width] + (padding * 2)
138
+ a_y1 = a[:y] - padding
139
+ a_y2 = a[:y] + a[:height] + (padding * 2)
140
140
 
141
141
  b_x1 = b[:x]
142
142
  b_x2 = b[:x] + b[:width]
143
143
  b_y1 = b[:y]
144
144
  b_y2 = b[:y] + b[:height]
145
145
 
146
- b_x1 >= a_x1 && b_x2 <= a_x2 && b_y1 >= a_y1 && b_y2 <= a_y2
146
+ b_x1 >= a_x1 && b_x2 <= a_x2 && b_y1 >= a_y1 && b_y2 <= a_y2
147
147
  end
148
148
 
149
149
  def self.overlap? from, by, a, b
@@ -158,7 +158,7 @@ module PdfExtract
158
158
  diffs = items.map {|item| (item[f] - ideals[f][0]).abs}
159
159
  diffs.map! {|d| d.nan? ? 1 : d}
160
160
  max_diff = diffs.max
161
-
161
+
162
162
  scores = diffs.map do |d|
163
163
  if d == 0
164
164
  ideals[f][1]
@@ -173,6 +173,6 @@ module PdfExtract
173
173
  end
174
174
  end
175
175
  end
176
-
176
+
177
177
  end
178
178
  end