pdf-extract 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,136 @@
1
+ svm_type c_svc
2
+ kernel_type rbf
3
+ gamma 8
4
+ nr_class 2
5
+ total_sv 127
6
+ rho -0.961432
7
+ label 0 1
8
+ nr_sv 90 37
9
+ SV
10
+ 0.01857436364550136 2:-1 3:-0.809524 4:-1 5:0.065
11
+ 0.01937561173990551 1:-1 2:-1 3:-1 4:-1 5:-0.9312
12
+ 0.0160910161902079 1:1 2:-1 3:-1 4:-1 5:0.15
13
+ 0.01400900681690667 1:0.761194 2:-0.777778 3:-1 4:0.725709 5:-0.088
14
+ 0.03032223844259175 1:-0.183673 2:-1 3:-1 4:-0.821478 5:-0.0426667
15
+ 0.02126950063232856 1:-0.743902 2:-0.391304 3:-1 4:-0.813717 5:-0.768
16
+ 0.02113478553748108 1:0.47619 2:-1 3:-1 4:-1 5:0.0933333
17
+ 0.01823561274250169 1:0.737705 2:-0.777778 3:-1 4:0.547187 5:-0.224
18
+ 0.09700250121827236 1:-0.809524 2:-0.52381 3:-1 4:-0.885236 5:0.32
19
+ 0.01766030634427132 1:-0.788079 2:-0.727273 3:-1 4:-0.866109 5:0.258182
20
+ 0.02997117793994787 1:-0.428571 2:-1 3:0.333333 4:-0.866109 5:-0.942857
21
+ 0.03160597346787591 1:-0.533981 2:-0.363636 3:-1 4:-0.878281 5:-0.96
22
+ 0.01883478571632254 1:1 2:-1 3:-1 4:-1 5:0.32
23
+ 0.06000320619142035 1:-0.548023 2:-0.25 3:-0.666667 4:-0.788006 5:-0.236364
24
+ 0.007004876856992319 1:-0.666667 2:-0.333333 3:-1 4:-1 5:-0.669091
25
+ 0.03827464518754259 1:-1 2:1 3:-1 4:-1 5:-0.96
26
+ 0.02726286733562099 1:1 2:-1 3:-1 4:-1 5:-0.96
27
+ 0.125 1:-0.696203 2:-0.333333 3:-0.666667 4:-0.888424 5:0.229333
28
+ 0.03344964665575857 1:0.800784 2:-1 3:-1 4:1 5:0.08
29
+ 0.03128222505105369 1:0.630952 2:-1 3:-1 4:-0.439331 5:0.08
30
+ 0.125 1:-0.575758 2:-0.666667 3:-1 4:-0.776848 5:-1.11022e-16
31
+ 0.125 1:-0.738149 2:-0.770642 3:-0.963303 4:-0.891904 5:0.16
32
+ 0.01250576564390459 1:-0.957447 2:-0.492958 3:-1 4:-0.996228 5:-0.9856
33
+ 0.03217406791268479 1:-0.00854701 2:-1 3:-1 4:-0.330544 5:-1.11022e-16
34
+ 0.02480526634234231 1:-0.971831 2:0.0909091 3:-1 4:-1 5:-0.133333
35
+ 0.01096993112524694 1:-1 2:-0.333333 3:-1 4:-1 5:-0.496
36
+ 0.012887019582354 1:-0.714286 3:-1 4:-1 5:-0.405333
37
+ 0.003331701548896184 1:-0.965812 2:-0.647059 3:-1 4:-1 5:-0.9856
38
+ 0.005803393610819437 1:-0.769231 2:-1 3:-1 4:-0.821478 5:-0.8224
39
+ 0.006890344962442365 1:-0.818182 2:-0.793103 3:-0.586207 4:-0.981532 5:-0.96
40
+ 0.125 1:-0.777778 2:-0.6 3:-0.84 4:-0.946444 5:0.125714
41
+ 0.001142586695743055 1:-0.705263 2:-0.466667 3:-1 4:-0.946444 5:-0.87
42
+ 0.01168005213986096 1:-0.605263 2:-0.333333 3:-1 4:-0.754533 5:-0.87
43
+ 0.125 1:-0.410959 2:-0.777778 3:-0.333333 4:-0.821478 5:0.32
44
+ 0.125 1:-0.327869 2:-0.969697 3:-0.69697 4:-0.906682 5:0.184
45
+ 0.00677461796062185 1:-0.537037 2:-0.733333 3:-0.733333 4:-0.821478 5:-0.935385
46
+ 0.001379215298278784 1:-0.748571 2:-0.652174 3:-1 4:-0.860287 5:-1
47
+ 0.0219356287285079 1:0.378378 2:-1 3:-1 4:-1 5:-0.68
48
+ 0.02157538481319524 1:-0.388889 2:-1 4:-0.888424 5:-1
49
+ 0.03357748970377396 2:-0.684211 3:-0.368421 4:-0.802687 5:-1
50
+ 0.003894873430265556 1:-0.958779 2:-0.537688 3:-1 4:-0.997309 5:-0.9856
51
+ 0.02545507275489556 1:1 2:-1 3:-1 4:-1 5:-0.632
52
+ 0.03899336388799851 1:0.0447761 2:-1 3:1 4:-1 5:-0.088
53
+ 0.03208662393251911 1:0.866667 2:-1 3:-1 4:0.472803 5:-0.516923
54
+ 0.004986448465924906 1:-1 2:-0.666667 3:-1 4:-1 5:-0.632
55
+ 0.009054249612984358 1:-0.707483 2:-0.487179 3:-1 4:-0.766549 5:-0.975238
56
+ 0.125 1:-0.130435 2:-1 3:-1 4:-0.933054 5:0.2656
57
+ 0.00932561307323757 1:-1 2:-1 3:-1 4:-1 5:-0.7
58
+ 0.125 1:-0.167382 2:-0.755102 3:-0.510204 4:-0.912561 5:0.048
59
+ 0.01366866452122291 1:-0.773913 2:-0.428571 3:-0.714286 4:-1 5:-0.942857
60
+ 0.125 1:-0.610811 2:-0.6 3:-0.84 4:-0.860753 5:-0.133333
61
+ 0.03093352314049818 1:-0.603175 2:-0.571429 3:-1 4:-0.751345 5:-0.813333
62
+ 0.003235639174723532 1:0.687075 2:-1 3:-1 4:-0.788006 5:0.32
63
+ 0.125 1:-0.369574 2:-0.727969 3:-0.877395 4:-0.944084 5:-0.224
64
+ 0.03304675897033391 1:-0.79562 2:0.263158 3:-1 4:-0.81678 5:-0.96
65
+ 0.04097040264957937 1:-0.760075 2:-0.533679 3:-0.937824 4:-0.968088 5:0.184
66
+ 0.125 1:-0.807692 2:-0.502762 3:-0.933702 4:-0.928986 5:0.32
67
+ 0.03861958220096956 1:0.857143 3:-1 4:0.606695 5:-0.949333
68
+ 0.0241583069877076 1:-0.259259 2:-1 3:-0.333333 4:-0.95537 5:-0.6592
69
+ 0.0246536887936159 1:-0.579439 2:-0.612903 3:-1 4:-0.714941 5:-0.942857
70
+ 0.125 1:-0.567568 2:-0.6 3:-1 4:-0.973222 5:0.215385
71
+ 0.125 1:-0.661538 2:-0.714286 3:-1 4:-0.942618 5:0.215385
72
+ 0.00561316195289121 1:-0.953488 2:-0.333333 3:-1 4:-1 5:-0.87
73
+ 0.0113461854574554 1:0.777778 2:-1 3:-1 4:-0.732218 5:0.32
74
+ 0.125 1:-0.59322 2:-0.733333 3:-1 4:-0.964296 5:0.215385
75
+ 0.125 1:-0.59633 2:-0.703704 3:-1 4:-0.960329 5:0.215385
76
+ 0.125 1:-0.529412 2:-1 3:-1 4:-0.732218 5:0.215385
77
+ 0.125 1:-0.705882 2:-0.791667 3:-1 4:-0.760112 5:0.32
78
+ 0.0291523348938202 1:-0.677852 2:-0.44 3:-1 4:-0.817908 5:-0.845714
79
+ 0.02288122491162361 1:0.244681 2:-0.944444 3:-0.277778 4:-0.90702 5:-0.53
80
+ 0.04322752767081517 1:-0.924961 2:-0.437372 3:-0.975359 4:-0.979655 5:1
81
+ 0.125 1:-0.737693 2:-0.674757 3:-0.951456 4:-0.899257 5:0.16
82
+ 0.1240880203368809 1:-0.676898 2:-0.781548 3:-1 4:-0.963084 5:0.138667
83
+ 0.125 1:-0.63222 2:-0.70979 3:-1 4:-0.956696 5:0.229333
84
+ 0.125 1:-0.79017 2:-0.55 3:-0.833333 4:-0.968387 5:0.12
85
+ 0.02530601224001184 1:-0.868733 2:-0.587463 3:-0.935469 4:-0.96655 5:0.32
86
+ 0.006671484160950634 1:-0.873016 2:-0.0588235 3:-1 4:-0.889737 5:-0.904
87
+ 0.125 1:-0.610811 2:-0.6 3:-0.84 4:-0.860753 5:0.32
88
+ 0.01570854122336294 1:-0.781818 2:-0.75 3:-0.5 4:-0.966527 5:-0.8768
89
+ 0.0836137287825839 1:-0.681159 2:-0.833333 3:-0.666667 4:-0.95537 5:-0.3328
90
+ 0.125 1:-0.681159 2:-0.833333 3:-0.666667 4:-0.95537 5:0.2112
91
+ 0.03714202104050296 1:-0.769231 2:-1 3:-1 4:-0.821478 5:0.2656
92
+ 0.125 1:-0.681159 2:-0.833333 3:-0.666667 4:-0.95537 5:0.32
93
+ 0.01857206694116549 1:0.322835 2:-1 3:-0.5 4:-0.944212 5:-0.36
94
+ 0.03075145032456461 1:-0.129496 2:-0.853659 3:0.0731707 4:-0.941218 5:-0.088
95
+ 0.00632380603735933 1:-0.695652 2:-1 3:-1 4:-0.948994 5:-0.8
96
+ 0.003936231589231347 1:-0.575758 2:-0.6 3:-1 4:-0.732218 5:-0.792727
97
+ 0.01447392509447366 1:0.111111 2:-1 3:-1 4:-0.785774 5:-0.53
98
+ 0.06198936228686536 1:-0.6 2:-1 3:-1 4:-1 5:0.0933333
99
+ 0.01295834392175865 1:0.142857 2:-1 3:-1 4:-1 5:-0.7
100
+ -0.125 1:-0.51054 2:-0.574586 3:-0.78453 4:-0.897177 5:0.32
101
+ -0.125 1:-0.474211 2:-0.787234 3:-0.730496 4:-0.84047 5:0.048
102
+ -0.125 1:-0.716069 2:-0.592233 3:-0.883495 4:-0.953203 5:0.229333
103
+ -0.125 1:-0.843511 2:-0.642857 3:-0.904762 4:-0.968121 5:0.32
104
+ -0.125 1:-0.483092 2:-0.75 3:-0.75 4:-0.832636 5:0.168889
105
+ -0.125 1:-0.361538 2:-0.74359 3:-0.74359 4:-0.876408 5:0.32
106
+ -0.125 1:-0.547792 2:-0.690789 3:-0.828947 4:-0.880203 5:0.32
107
+ -0.02063505424396911 1:-0.505391 2:-0.757848 3:-0.820628 4:-0.865508 5:0.32
108
+ -0.125 1:-0.48954 2:-0.670251 3:-0.784946 4:-0.890584 5:0.184
109
+ -0.125 1:-0.393393 2:-0.821429 3:-0.785714 4:-0.890018 5:0.32
110
+ -0.125 1:-0.552613 2:-0.604061 3:-0.796954 4:-0.895334 5:0.32
111
+ -0.125 1:-0.56341 2:-0.704762 3:-0.809524 4:-0.848257 5:0.32
112
+ -0.125 1:-0.543767 2:-0.743119 3:-0.832241 4:-0.880674 5:0.32
113
+ -0.125 1:-0.494214 2:-0.720183 3:-0.724771 4:-0.851983 5:0.16
114
+ -0.125 1:-0.566851 2:-0.662269 3:-0.82058 4:-0.842439 5:0.32
115
+ -0.125 1:-0.521058 2:-0.801829 3:-0.823171 4:-0.853454 5:0.32
116
+ -0.125 1:-0.464121 2:-0.779661 3:-0.737288 4:-0.838877 5:0.424615
117
+ -0.125 1:-0.476154 2:-0.825137 3:-0.759563 4:-0.847818 5:0.456
118
+ -0.125 1:-0.515772 2:-0.833333 3:-0.583333 4:-0.901441 5:0.32
119
+ -0.125 1:-0.527421 2:-0.730413 3:-0.814659 4:-0.894421 5:0.125714
120
+ -0.125 1:-0.524205 2:-0.818616 3:-0.809069 4:-0.880489 5:0.32
121
+ -0.125 1:-0.548023 2:-0.787234 3:-0.829787 4:-0.897445 5:0.125714
122
+ -0.125 1:-0.419355 2:-0.839286 3:-0.5 4:-0.856545 5:0.32
123
+ -0.125 1:-0.515738 2:-0.72093 3:-0.661734 4:-0.899228 5:0.0285714
124
+ -0.125 1:-0.537209 2:-0.811024 3:-0.811024 4:-0.877706 5:0.32
125
+ -0.125 1:-0.512427 2:-0.820175 3:-0.561404 4:-0.90839 5:0.125714
126
+ -0.125 1:-0.566514 2:-0.734375 3:-0.78125 4:-0.851464 5:0.32
127
+ -0.125 1:-0.581746 2:-0.670455 3:-0.818182 4:-0.893496 5:0.32
128
+ -0.125 1:-0.501896 2:-0.760825 3:-0.802062 4:-0.864176 5:0.529231
129
+ -0.125 1:-0.658537 2:-0.481481 3:-1 4:-0.821478 5:-0.845714
130
+ -0.125 1:-0.728738 2:-0.552426 3:-0.862285 4:-0.92373 5:0.32
131
+ -0.125 1:-0.554046 2:-0.572118 3:-0.789812 4:-0.859719 5:0.32
132
+ -0.125 1:-0.515097 2:-0.566098 3:-0.855011 4:-0.830994 5:0.32
133
+ -0.125 1:-0.603835 2:-0.577181 3:-0.704698 4:-0.875993 5:-0.174545
134
+ -0.125 1:-0.57637 2:-0.550278 3:-0.777665 4:-0.875919 5:0.32
135
+ -0.125 1:-0.496503 2:-0.826087 3:-0.826087 4:-0.930144 5:0.32
136
+ -0.125 1:-0.469146 2:-0.714912 3:-0.640351 4:-0.897233 5:0.222857
data/lib/#language.rb# ADDED
@@ -0,0 +1,66 @@
1
+ require_relative "names"
2
+
3
+ module PdfExtract::Language
4
+
5
+ def self.transliterate s
6
+ s = s.gsub "\ufb01", "fi"
7
+ s = s.gsub "\ufb02", "fl"
8
+ s = s.gsub "\ufb03", "ffi"
9
+ s = s.gsub "\ufb04", "ffl"
10
+ s = s.gsub "\ufb06", "st"
11
+ s = s.gsub "\u2018", "'"
12
+ s = s.gsub "\u2019", "'"
13
+ s = s.gsub "\u2013", "-"
14
+ s = s.gsub "\u2014", "-"
15
+ s = s.gsub "\u201c", "\""
16
+ s = s.gsub "\u201d", "\""
17
+ s = s.gsub "\u25af", "("
18
+ s = s.gsub "\u00b4", ""
19
+ s = s.gsub "\u00b1", "-"
20
+
21
+ s = s.gsub /\s+/, " "
22
+ end
23
+
24
+ def self.letter_ratio s
25
+ s.count("A-Z0-9\-[],.\"'()") / s.length.to_f
26
+ end
27
+
28
+ # TODO Ignore caps in middle of words
29
+ def self.cap_ratio s
30
+ sentence_end = true
31
+ cap_count = 0
32
+
33
+ s.each_char do |c|
34
+ if c =~ /\./
35
+ sentence_end = true
36
+ elsif c =~ /[A-Z]/
37
+ cap_count = cap_count + 1 unless sentence_end
38
+ sentence_end = false
39
+ elsif c =~ /[^\s]/
40
+ sentence_end = false
41
+ end
42
+ end
43
+
44
+ cap_count / s.split.length.to_f
45
+ end
46
+
47
+ def self.year_ratio s
48
+ words = s.split
49
+
50
+ year_words = words.map do |word|
51
+ word =~ /[^\d]\d{4}[^\d]/
52
+ end
53
+
54
+ year_words.reject { |year_word| not year_word }.length / words.length.to_f
55
+ end
56
+
57
+ def self.name_ratio content
58
+ PdfExtract::Names.detect_names(content)[:name_frequency]
59
+ end
60
+
61
+ def self.word_count s
62
+ s.split.count
63
+ end
64
+
65
+ end
66
+
@@ -138,21 +138,15 @@ module PdfExtract
138
138
  sections = add_content_stats sections, pages.keys.count
139
139
 
140
140
  # Score sections into categories based on their textual attributes.
141
- ideals = {
142
- :reference => {
143
- :name_ratio => [0.2, 2],
144
- :letter_ratio => [0.25, 2],
145
- :year_ratio => [0.05, 7],
146
- :cap_ratio => [0.5, 5]
147
- },
148
- :body => {
149
- :name_ratio => [0.03, 1],
150
- :letter_ratio => [0.1, 1],
151
- :year_ratio => [0.0, 1]
152
- }
141
+ ref_ideals = {
142
+ :name_ratio => [0.14, 1],
143
+ :letter_ratio => [0.23, 6],
144
+ :year_ratio => [0.05, 10],
145
+ :cap_ratio => [0.49, 10],
146
+ :lateness => [0.96, 6]
153
147
  }
154
148
 
155
- Spatial.score(sections, ideals)
149
+ Spatial.score(sections, ref_ideals, :reference_score)
156
150
 
157
151
  sections
158
152
  end
data/lib/language.rb CHANGED
@@ -5,12 +5,21 @@ module PdfExtract::Language
5
5
  def self.transliterate s
6
6
  s = s.gsub "\ufb01", "fi"
7
7
  s = s.gsub "\ufb02", "fl"
8
+ s = s.gsub "\ufb03", "ffi"
9
+ s = s.gsub "\ufb04", "ffl"
10
+ s = s.gsub "\ufb06", "st"
8
11
  s = s.gsub "\u2018", "'"
9
12
  s = s.gsub "\u2019", "'"
10
13
  s = s.gsub "\u2013", "-"
14
+ s = s.gsub "\u2014", "-"
11
15
  s = s.gsub "\u201c", "\""
12
16
  s = s.gsub "\u201d", "\""
13
- s
17
+ s = s.gsub "\u25af", "("
18
+ s = s.gsub "\u00b4", ""
19
+ s = s.gsub "\u00b1", "-"
20
+
21
+
22
+ s = s.gsub /\s+/, " "
14
23
  end
15
24
 
16
25
  def self.letter_ratio s
@@ -40,7 +49,7 @@ module PdfExtract::Language
40
49
  words = s.split
41
50
 
42
51
  year_words = words.map do |word|
43
- word =~ /\.*\d{4}\.*/
52
+ word =~ /[^\d]\d{4}[^\d]/
44
53
  end
45
54
 
46
55
  year_words.reject { |year_word| not year_word }.length / words.length.to_f
@@ -49,8 +49,17 @@ module PdfExtract
49
49
  end
50
50
  end
51
51
 
52
+ def self.find_media_box page, objects
53
+ if page[:MediaBox]
54
+ page[:MediaBox]
55
+ elsif page[:Parent]
56
+ find_media_box objects[page[:Parent]], objects
57
+ else
58
+ [0, 0, 0, 0]
59
+ end
60
+ end
61
+
52
62
  def self.make_text_runs text, tj, state, render_state, page, page_number
53
- # TODO Ignore chars outside the page :MediaBox.
54
63
  # TODO Mul UserUnit if specified by page.
55
64
  # TODO Include writing mode, so that runs can be joined either
56
65
  # virtically or horizontally in the join stage.
@@ -87,6 +96,8 @@ module PdfExtract
87
96
 
88
97
  px = bl_pos.row(0)[0]
89
98
  py = bl_pos.row(0)[1]
99
+
100
+ media_box = find_media_box(page.page_object, page.objects)
90
101
 
91
102
  objs << {
92
103
  :x => px,
@@ -97,8 +108,8 @@ module PdfExtract
97
108
  :content => state.last[:font].to_utf8(c),
98
109
  :page => page_number,
99
110
  :font => state.last[:font].basefont,
100
- :page_width => page[:MediaBox][2] - page[:MediaBox][0],
101
- :page_height => page[:MediaBox][3] - page[:MediaBox][1]
111
+ :page_width => media_box[2] - media_box[0],
112
+ :page_height => media_box[3] - media_box[1]
102
113
  }
103
114
 
104
115
  disp_x, disp_y = glyph_displacement(c, state)
@@ -114,6 +125,17 @@ module PdfExtract
114
125
  objs
115
126
  end
116
127
 
128
+ def self.build_fonts page
129
+ fonts = {}
130
+ font_metrics = {}
131
+ page.fonts.each do |label, ref|
132
+ font = PDF::Reader::Font.new(page.objects, page.objects[ref])
133
+ fonts[label] = font
134
+ font_metrics[label] = FontMetrics.new font
135
+ end
136
+ [fonts, font_metrics]
137
+ end
138
+
117
139
  def self.include_in pdf
118
140
 
119
141
  pdf.spatials :characters do |parser|
@@ -127,15 +149,19 @@ module PdfExtract
127
149
  :tlm => Matrix.identity(3)
128
150
  }
129
151
 
130
- parser.for :resource_font do |data|
131
- fonts[data[0]] = data[1]
132
- font_metrics[data[0]] = FontMetrics.new data[1]
133
- nil
134
- end
152
+ # parser.for :resource_font do |data|
153
+ # puts data
154
+ # fonts[data[0]] = data[1]
155
+ # font_metrics[data[0]] = FontMetrics.new data[1]
156
+ # nil
157
+ # end
135
158
 
136
159
  parser.for :begin_page do |data|
137
160
  page = data[0]
138
161
  page_n = page_n.next
162
+
163
+ fonts, font_metrics = build_fonts page
164
+
139
165
  state << {
140
166
  :h_scale => 100,
141
167
  :char_spacing => 0,
@@ -270,7 +296,7 @@ module PdfExtract
270
296
 
271
297
  # Show text operators.
272
298
 
273
- parser.for :set_spacing_next_line_show_text_raw do |data|
299
+ parser.for :set_spacing_next_line_show_text do |data|
274
300
  state.last[:word_spacing] = data[0]
275
301
  state.last[:char_spacing] = data[1]
276
302
 
@@ -282,7 +308,7 @@ module PdfExtract
282
308
  make_text_runs data[2], 0, state, render_state, page, page_n
283
309
  end
284
310
 
285
- parser.for :move_to_next_line_and_show_text_raw do |data|
311
+ parser.for :move_to_next_line_and_show_text do |data|
286
312
  render_state[:tm] = Matrix[
287
313
  [1, 0, 0], [0, 1, 0], [0, -state.last[:leading], 1]
288
314
  ] * render_state[:tlm]
@@ -291,11 +317,11 @@ module PdfExtract
291
317
  make_text_runs data.first, 0, state, render_state, page, page_n
292
318
  end
293
319
 
294
- parser.for :show_text_raw do |data|
320
+ parser.for :show_text do |data|
295
321
  make_text_runs data.first, 0, state, render_state, page, page_n
296
322
  end
297
323
 
298
- parser.for :show_text_with_positioning_raw do |data|
324
+ parser.for :show_text_with_positioning do |data|
299
325
  data = data.first
300
326
  runs = []
301
327
  tj = 0
data/lib/pdf.rb CHANGED
@@ -129,9 +129,9 @@ module PdfExtract
129
129
  end
130
130
 
131
131
  paged_objs.each_pair do |page, objs|
132
- self.call_before
132
+ call_before
133
133
 
134
- if self.object_calls?
134
+ if object_calls?
135
135
  @object_listeners.each_pair do |type, listeners|
136
136
  listeners.each do |listener|
137
137
  if objs[type].nil?
@@ -142,22 +142,29 @@ module PdfExtract
142
142
  end
143
143
  end
144
144
 
145
- self.call_after
145
+ call_after
146
146
  end
147
147
 
148
148
  else
149
149
 
150
- self.call_before
151
- if self.object_calls?
152
- self.call_object_listeners @pdf.spatial_objects
150
+ call_before
151
+ if object_calls?
152
+ call_object_listeners @pdf.spatial_objects
153
153
  end
154
- self.call_after
154
+ call_after
155
155
 
156
156
  end
157
157
 
158
- if self.for_calls?
159
- self.expand_listeners_to_callback_methods
160
- PDF::Reader.file filename, self, :raw_text => true
158
+ if for_calls?
159
+ expand_listeners_to_callback_methods
160
+ #PDF::Reader.file filename, self, :raw_text => true
161
+
162
+ reader = PDF::Reader.new filename, :raw_text => true
163
+ reader.pages.each do |page|
164
+ begin_page page
165
+ page.walk self
166
+ end_page page
167
+ end
161
168
  end
162
169
  end
163
170
 
@@ -1,11 +1,14 @@
1
+ # -*- coding: utf-8 -*-
1
2
  require_relative "../spatial"
3
+ require_relative "score"
2
4
 
3
5
  module PdfExtract
4
6
  module References
5
-
6
- Settings.default :min_score, 8.0
7
+
8
+ Settings.default :reference_flex, 0.1
7
9
  Settings.default :min_sequence_count, 3
8
10
  Settings.default :max_reference_order, 1000
11
+ Settings.default :min_lateness , 0.5
9
12
 
10
13
  def self.partition_by ary, &block
11
14
  matching = []
@@ -159,23 +162,44 @@ module PdfExtract
159
162
  def self.include_in pdf
160
163
  pdf.spatials :references, :depends_on => [:sections] do |parser|
161
164
 
162
- refs = []
165
+ sections = []
163
166
 
164
167
  parser.objects :sections do |section|
165
- # TODO Take top x%, fix Infinity coming back from score.
166
- if section[:reference_score] >= pdf.settings[:min_score]
167
- if numeric_sequence? pdf, Spatial.get_text_content(section)
168
- refs += split_by_delimiter pdf, Spatial.get_text_content(section)
169
- elsif multi_margin? section[:lines]
170
- refs += split_by_margin section[:lines]
171
- elsif multi_spacing? section[:lines]
172
- refs += split_by_line_spacing section[:lines]
173
- end
174
- end
168
+ sections << section
175
169
  end
176
170
 
177
171
  parser.after do
178
- refs
172
+ max_score = sections.map {|s| s[:reference_score]}.max
173
+ min_permittable = max_score - (max_score * pdf.settings[:reference_flex])
174
+
175
+ refs = []
176
+
177
+ sections = sections.reject do |s|
178
+ # A section without any years is definitely not a list of
179
+ # references. So too a section that appears in the first
180
+ # half of an article.
181
+ s[:lateness] < pdf.settings[:min_lateness] || s[:year_ratio].zero?
182
+ end
183
+
184
+ sections.each do |section|
185
+ if section[:reference_score] >= min_permittable
186
+ if numeric_sequence? pdf, Spatial.get_text_content(section)
187
+ refs += split_by_delimiter pdf, Spatial.get_text_content(section)
188
+ elsif multi_margin? section[:lines]
189
+ refs += split_by_margin section[:lines]
190
+ elsif multi_spacing? section[:lines]
191
+ refs += split_by_line_spacing section[:lines]
192
+ end
193
+ end
194
+ end
195
+
196
+ # TODO Ideally we wouldn't see the ref headers here.
197
+ # Unfortunately publication details can look a lot like references.
198
+ refs.reject do |ref|
199
+ norm = ref[:content].downcase.strip
200
+ norm =~ /references?/ || norm =~ /submitted for publication/ || norm =~ /additional contributions/
201
+ end
202
+
179
203
  end
180
204
 
181
205
  end
@@ -0,0 +1,28 @@
1
+ require "svm"
2
+
3
+ module PdfExtract
4
+ module Score
5
+
6
+ def self.path_to_data data_filename
7
+ File.join(File.dirname(File.expand_path(__FILE__)), "../../data/" + data_filename)
8
+ end
9
+
10
+ @@reference_model = Model.new(path_to_data("reference.model"))
11
+
12
+ def self.reference? section
13
+ sample = {
14
+ 1 => section[:letter_ratio],
15
+ 2 => section[:name_ratio],
16
+ 3 => section[:year_ratio],
17
+ 4 => section[:cap_ratio],
18
+ 5 => section[:lateness]
19
+ }
20
+
21
+ puts sample
22
+
23
+ puts @@reference_model.predict(sample)
24
+ @@reference_model.predict(sample) == 1
25
+ end
26
+
27
+ end
28
+ end
data/lib/spatial.rb CHANGED
@@ -153,42 +153,23 @@ module PdfExtract
153
153
  (b_top <= a_top && b_top >= a[from]) || (b[from] >= a[from] && b[from] <= b_top)
154
154
  end
155
155
 
156
- def self.score items, ideals
157
- types = {}
158
- ideals.keys.each do |name|
159
- types[name] = ideals[name].keys
160
- end
161
-
162
- types.each do |name, vars|
163
- score_name = (name.to_s + "_score").to_sym
156
+ def self.score items, ideals, name
157
+ ideals.keys.each do |f|
158
+ diffs = items.map {|item| (item[f] - ideals[f][0]).abs}
159
+ diffs.map! {|d| d.nan? ? 1 : d}
160
+ max_diff = diffs.max
164
161
 
165
- vars.each do |var_name|
166
-
167
- scores = []
168
- items.each do |item|
169
- diff = (item[var_name] - ideals[name][var_name][0]).abs
170
- if diff.zero?
171
- scores << 1.0
172
- else
173
- s = 1.0 / diff
174
- if not s.finite?
175
- scores << 0.0
176
- else
177
- scores << s
178
- end
179
- end
180
- end
181
-
182
- score_max = scores.max
183
- weighted_scores = scores.map do |score|
184
- (score / score_max) * ideals[name][var_name][1]
162
+ scores = diffs.map do |d|
163
+ if d == 0
164
+ ideals[f][1]
165
+ else
166
+ (1 - (d / max_diff)) * ideals[f][1]
185
167
  end
168
+ end
186
169
 
187
- items.each_index do |idx|
188
- items[idx][score_name] ||= 0.0
189
- items[idx][score_name] += weighted_scores[idx]
190
- end
191
-
170
+ items.each_index do |i|
171
+ items[i][name] ||= 0
172
+ items[i][name] = items[i][name] + scores[i]
192
173
  end
193
174
  end
194
175
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 6
9
- version: 0.0.6
8
+ - 7
9
+ version: 0.0.7
10
10
  platform: ruby
11
11
  authors:
12
12
  - Karl Jonathan Ward
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-11-02 00:00:00 +00:00
17
+ date: 2011-11-09 00:00:00 +00:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -23,14 +23,13 @@ dependencies:
23
23
  requirement: &id001 !ruby/object:Gem::Requirement
24
24
  none: false
25
25
  requirements:
26
- - - "="
26
+ - - ">="
27
27
  - !ruby/object:Gem::Version
28
28
  segments:
29
- - 1
30
29
  - 0
31
- - 0
32
- - beta1
33
- version: 1.0.0.beta1
30
+ - 10
31
+ - 1
32
+ version: 0.10.1
34
33
  type: :runtime
35
34
  version_requirements: *id001
36
35
  - !ruby/object:Gem::Dependency
@@ -146,6 +145,7 @@ files:
146
145
  - bin/some6.mask.pdf
147
146
  - bin/train.rb
148
147
  - bin/two-column.mask.pdf
148
+ - lib/#language.rb#
149
149
  - lib/analysis/columns.rb
150
150
  - lib/analysis/margins.rb
151
151
  - lib/analysis/sections.rb
@@ -164,12 +164,14 @@ files:
164
164
  - lib/references/references.rb
165
165
  - lib/references/resolve.rb
166
166
  - lib/references/resolved_references.rb
167
+ - lib/references/score.rb
167
168
  - lib/spatial.rb
168
169
  - lib/view/abstract_view.rb
169
170
  - lib/view/pdf_view.rb
170
171
  - lib/view/png_view.rb
171
172
  - lib/view/xml_view.rb
172
173
  - data/familynames.db
174
+ - data/reference.model
173
175
  - data/stopwords.txt
174
176
  has_rdoc: true
175
177
  homepage: http://github.com/CrossRef/pdfextract