pdf-extract 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/analysis/sections.rb +3 -2
- data/lib/font_metrics.rb +8 -2
- data/lib/names.rb +1 -1
- data/lib/references/references.rb +1 -1
- data/lib/spatial.rb +8 -2
- metadata +2 -2
data/lib/analysis/sections.rb
CHANGED
@@ -133,9 +133,10 @@ module PdfExtract
|
|
133
133
|
# Score sections into categories based on their textual attributes.
|
134
134
|
ideals = {
|
135
135
|
:reference => {
|
136
|
-
:name_ratio => [0.2,
|
136
|
+
:name_ratio => [0.2, 2],
|
137
137
|
:letter_ratio => [0.25, 2],
|
138
|
-
:year_ratio => [0.05, 7]
|
138
|
+
:year_ratio => [0.05, 7],
|
139
|
+
:cap_ratio => [0.5, 5]
|
139
140
|
},
|
140
141
|
:body => {
|
141
142
|
:name_ratio => [0.03, 1],
|
data/lib/font_metrics.rb
CHANGED
@@ -26,6 +26,10 @@ module PdfExtract
|
|
26
26
|
attr_accessor :ascent, :descent, :bbox
|
27
27
|
|
28
28
|
def initialize font
|
29
|
+
@ascent = 0
|
30
|
+
@descent = 0
|
31
|
+
@bbox = [0, 0, 0, 0]
|
32
|
+
|
29
33
|
base_font = font.basefont.to_s
|
30
34
|
if @@base_fonts.key? base_font
|
31
35
|
@ascent = @@base_fonts[base_font][:Ascent]
|
@@ -41,8 +45,10 @@ module PdfExtract
|
|
41
45
|
@glyph_width_lookup = proc { |c| font.glyph_width c }
|
42
46
|
end
|
43
47
|
|
44
|
-
|
45
|
-
|
48
|
+
if not @bbox.nil?
|
49
|
+
@ascent = @bbox[3] if @ascent.nil? || @ascent.zero?
|
50
|
+
@descent = @bbox[1] if @descent.nil? || @descent.zero?
|
51
|
+
end
|
46
52
|
end
|
47
53
|
|
48
54
|
def glyph_width c
|
data/lib/names.rb
CHANGED
data/lib/spatial.rb
CHANGED
@@ -165,9 +165,15 @@ module PdfExtract
|
|
165
165
|
items.each do |item|
|
166
166
|
diff = (item[var_name] - ideals[name][var_name][0]).abs
|
167
167
|
if diff.zero?
|
168
|
-
|
168
|
+
scores << 1.0
|
169
|
+
else
|
170
|
+
s = 1.0 / diff
|
171
|
+
if not s.finite?
|
172
|
+
scores << 0.0
|
173
|
+
else
|
174
|
+
scores << s
|
175
|
+
end
|
169
176
|
end
|
170
|
-
scores << 1.0 / diff
|
171
177
|
end
|
172
178
|
|
173
179
|
score_max = scores.max
|