pdf-extract 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -133,9 +133,10 @@ module PdfExtract
133
133
  # Score sections into categories based on their textual attributes.
134
134
  ideals = {
135
135
  :reference => {
136
- :name_ratio => [0.2, 5],
136
+ :name_ratio => [0.2, 2],
137
137
  :letter_ratio => [0.25, 2],
138
- :year_ratio => [0.05, 7]
138
+ :year_ratio => [0.05, 7],
139
+ :cap_ratio => [0.5, 5]
139
140
  },
140
141
  :body => {
141
142
  :name_ratio => [0.03, 1],
data/lib/font_metrics.rb CHANGED
@@ -26,6 +26,10 @@ module PdfExtract
26
26
  attr_accessor :ascent, :descent, :bbox
27
27
 
28
28
  def initialize font
29
+ @ascent = 0
30
+ @descent = 0
31
+ @bbox = [0, 0, 0, 0]
32
+
29
33
  base_font = font.basefont.to_s
30
34
  if @@base_fonts.key? base_font
31
35
  @ascent = @@base_fonts[base_font][:Ascent]
@@ -41,8 +45,10 @@ module PdfExtract
41
45
  @glyph_width_lookup = proc { |c| font.glyph_width c }
42
46
  end
43
47
 
44
- @ascent = @bbox[3] if @ascent.zero?
45
- @descent = @bbox[1] if @descent.zero?
48
+ if not @bbox.nil?
49
+ @ascent = @bbox[3] if @ascent.nil? || @ascent.zero?
50
+ @descent = @bbox[1] if @descent.nil? || @descent.zero?
51
+ end
46
52
  end
47
53
 
48
54
  def glyph_width c
data/lib/names.rb CHANGED
@@ -7,7 +7,7 @@ require_relative "pdf-extract"
7
7
  module PdfExtract::Names
8
8
 
9
9
  class NamesDatabase
10
- @@ambiguous_weighting = 0.1
10
+ @@ambiguous_weighting = 0.0
11
11
  @@unambiguous_weighting = 1.0
12
12
 
13
13
  def self.path_to_data data_filename
@@ -3,7 +3,7 @@ require_relative "../spatial"
3
3
  module PdfExtract
4
4
  module References
5
5
 
6
- Settings.default :min_score, 6.4
6
+ Settings.default :min_score, 8.0
7
7
  Settings.default :min_sequence_count, 3
8
8
  Settings.default :max_reference_order, 1000
9
9
 
data/lib/spatial.rb CHANGED
@@ -165,9 +165,15 @@ module PdfExtract
165
165
  items.each do |item|
166
166
  diff = (item[var_name] - ideals[name][var_name][0]).abs
167
167
  if diff.zero?
168
- diff = Float::MIN
168
+ scores << 1.0
169
+ else
170
+ s = 1.0 / diff
171
+ if not s.finite?
172
+ scores << 0.0
173
+ else
174
+ scores << s
175
+ end
169
176
  end
170
- scores << 1.0 / diff
171
177
  end
172
178
 
173
179
  score_max = scores.max
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 3
9
- version: 0.0.3
8
+ - 4
9
+ version: 0.0.4
10
10
  platform: ruby
11
11
  authors:
12
12
  - Karl Jonathan Ward