pdf-extract 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,136 @@
1
+ svm_type c_svc
2
+ kernel_type rbf
3
+ gamma 8
4
+ nr_class 2
5
+ total_sv 127
6
+ rho -0.961432
7
+ label 0 1
8
+ nr_sv 90 37
9
+ SV
10
+ 0.01857436364550136 2:-1 3:-0.809524 4:-1 5:0.065
11
+ 0.01937561173990551 1:-1 2:-1 3:-1 4:-1 5:-0.9312
12
+ 0.0160910161902079 1:1 2:-1 3:-1 4:-1 5:0.15
13
+ 0.01400900681690667 1:0.761194 2:-0.777778 3:-1 4:0.725709 5:-0.088
14
+ 0.03032223844259175 1:-0.183673 2:-1 3:-1 4:-0.821478 5:-0.0426667
15
+ 0.02126950063232856 1:-0.743902 2:-0.391304 3:-1 4:-0.813717 5:-0.768
16
+ 0.02113478553748108 1:0.47619 2:-1 3:-1 4:-1 5:0.0933333
17
+ 0.01823561274250169 1:0.737705 2:-0.777778 3:-1 4:0.547187 5:-0.224
18
+ 0.09700250121827236 1:-0.809524 2:-0.52381 3:-1 4:-0.885236 5:0.32
19
+ 0.01766030634427132 1:-0.788079 2:-0.727273 3:-1 4:-0.866109 5:0.258182
20
+ 0.02997117793994787 1:-0.428571 2:-1 3:0.333333 4:-0.866109 5:-0.942857
21
+ 0.03160597346787591 1:-0.533981 2:-0.363636 3:-1 4:-0.878281 5:-0.96
22
+ 0.01883478571632254 1:1 2:-1 3:-1 4:-1 5:0.32
23
+ 0.06000320619142035 1:-0.548023 2:-0.25 3:-0.666667 4:-0.788006 5:-0.236364
24
+ 0.007004876856992319 1:-0.666667 2:-0.333333 3:-1 4:-1 5:-0.669091
25
+ 0.03827464518754259 1:-1 2:1 3:-1 4:-1 5:-0.96
26
+ 0.02726286733562099 1:1 2:-1 3:-1 4:-1 5:-0.96
27
+ 0.125 1:-0.696203 2:-0.333333 3:-0.666667 4:-0.888424 5:0.229333
28
+ 0.03344964665575857 1:0.800784 2:-1 3:-1 4:1 5:0.08
29
+ 0.03128222505105369 1:0.630952 2:-1 3:-1 4:-0.439331 5:0.08
30
+ 0.125 1:-0.575758 2:-0.666667 3:-1 4:-0.776848 5:-1.11022e-16
31
+ 0.125 1:-0.738149 2:-0.770642 3:-0.963303 4:-0.891904 5:0.16
32
+ 0.01250576564390459 1:-0.957447 2:-0.492958 3:-1 4:-0.996228 5:-0.9856
33
+ 0.03217406791268479 1:-0.00854701 2:-1 3:-1 4:-0.330544 5:-1.11022e-16
34
+ 0.02480526634234231 1:-0.971831 2:0.0909091 3:-1 4:-1 5:-0.133333
35
+ 0.01096993112524694 1:-1 2:-0.333333 3:-1 4:-1 5:-0.496
36
+ 0.012887019582354 1:-0.714286 3:-1 4:-1 5:-0.405333
37
+ 0.003331701548896184 1:-0.965812 2:-0.647059 3:-1 4:-1 5:-0.9856
38
+ 0.005803393610819437 1:-0.769231 2:-1 3:-1 4:-0.821478 5:-0.8224
39
+ 0.006890344962442365 1:-0.818182 2:-0.793103 3:-0.586207 4:-0.981532 5:-0.96
40
+ 0.125 1:-0.777778 2:-0.6 3:-0.84 4:-0.946444 5:0.125714
41
+ 0.001142586695743055 1:-0.705263 2:-0.466667 3:-1 4:-0.946444 5:-0.87
42
+ 0.01168005213986096 1:-0.605263 2:-0.333333 3:-1 4:-0.754533 5:-0.87
43
+ 0.125 1:-0.410959 2:-0.777778 3:-0.333333 4:-0.821478 5:0.32
44
+ 0.125 1:-0.327869 2:-0.969697 3:-0.69697 4:-0.906682 5:0.184
45
+ 0.00677461796062185 1:-0.537037 2:-0.733333 3:-0.733333 4:-0.821478 5:-0.935385
46
+ 0.001379215298278784 1:-0.748571 2:-0.652174 3:-1 4:-0.860287 5:-1
47
+ 0.0219356287285079 1:0.378378 2:-1 3:-1 4:-1 5:-0.68
48
+ 0.02157538481319524 1:-0.388889 2:-1 4:-0.888424 5:-1
49
+ 0.03357748970377396 2:-0.684211 3:-0.368421 4:-0.802687 5:-1
50
+ 0.003894873430265556 1:-0.958779 2:-0.537688 3:-1 4:-0.997309 5:-0.9856
51
+ 0.02545507275489556 1:1 2:-1 3:-1 4:-1 5:-0.632
52
+ 0.03899336388799851 1:0.0447761 2:-1 3:1 4:-1 5:-0.088
53
+ 0.03208662393251911 1:0.866667 2:-1 3:-1 4:0.472803 5:-0.516923
54
+ 0.004986448465924906 1:-1 2:-0.666667 3:-1 4:-1 5:-0.632
55
+ 0.009054249612984358 1:-0.707483 2:-0.487179 3:-1 4:-0.766549 5:-0.975238
56
+ 0.125 1:-0.130435 2:-1 3:-1 4:-0.933054 5:0.2656
57
+ 0.00932561307323757 1:-1 2:-1 3:-1 4:-1 5:-0.7
58
+ 0.125 1:-0.167382 2:-0.755102 3:-0.510204 4:-0.912561 5:0.048
59
+ 0.01366866452122291 1:-0.773913 2:-0.428571 3:-0.714286 4:-1 5:-0.942857
60
+ 0.125 1:-0.610811 2:-0.6 3:-0.84 4:-0.860753 5:-0.133333
61
+ 0.03093352314049818 1:-0.603175 2:-0.571429 3:-1 4:-0.751345 5:-0.813333
62
+ 0.003235639174723532 1:0.687075 2:-1 3:-1 4:-0.788006 5:0.32
63
+ 0.125 1:-0.369574 2:-0.727969 3:-0.877395 4:-0.944084 5:-0.224
64
+ 0.03304675897033391 1:-0.79562 2:0.263158 3:-1 4:-0.81678 5:-0.96
65
+ 0.04097040264957937 1:-0.760075 2:-0.533679 3:-0.937824 4:-0.968088 5:0.184
66
+ 0.125 1:-0.807692 2:-0.502762 3:-0.933702 4:-0.928986 5:0.32
67
+ 0.03861958220096956 1:0.857143 3:-1 4:0.606695 5:-0.949333
68
+ 0.0241583069877076 1:-0.259259 2:-1 3:-0.333333 4:-0.95537 5:-0.6592
69
+ 0.0246536887936159 1:-0.579439 2:-0.612903 3:-1 4:-0.714941 5:-0.942857
70
+ 0.125 1:-0.567568 2:-0.6 3:-1 4:-0.973222 5:0.215385
71
+ 0.125 1:-0.661538 2:-0.714286 3:-1 4:-0.942618 5:0.215385
72
+ 0.00561316195289121 1:-0.953488 2:-0.333333 3:-1 4:-1 5:-0.87
73
+ 0.0113461854574554 1:0.777778 2:-1 3:-1 4:-0.732218 5:0.32
74
+ 0.125 1:-0.59322 2:-0.733333 3:-1 4:-0.964296 5:0.215385
75
+ 0.125 1:-0.59633 2:-0.703704 3:-1 4:-0.960329 5:0.215385
76
+ 0.125 1:-0.529412 2:-1 3:-1 4:-0.732218 5:0.215385
77
+ 0.125 1:-0.705882 2:-0.791667 3:-1 4:-0.760112 5:0.32
78
+ 0.0291523348938202 1:-0.677852 2:-0.44 3:-1 4:-0.817908 5:-0.845714
79
+ 0.02288122491162361 1:0.244681 2:-0.944444 3:-0.277778 4:-0.90702 5:-0.53
80
+ 0.04322752767081517 1:-0.924961 2:-0.437372 3:-0.975359 4:-0.979655 5:1
81
+ 0.125 1:-0.737693 2:-0.674757 3:-0.951456 4:-0.899257 5:0.16
82
+ 0.1240880203368809 1:-0.676898 2:-0.781548 3:-1 4:-0.963084 5:0.138667
83
+ 0.125 1:-0.63222 2:-0.70979 3:-1 4:-0.956696 5:0.229333
84
+ 0.125 1:-0.79017 2:-0.55 3:-0.833333 4:-0.968387 5:0.12
85
+ 0.02530601224001184 1:-0.868733 2:-0.587463 3:-0.935469 4:-0.96655 5:0.32
86
+ 0.006671484160950634 1:-0.873016 2:-0.0588235 3:-1 4:-0.889737 5:-0.904
87
+ 0.125 1:-0.610811 2:-0.6 3:-0.84 4:-0.860753 5:0.32
88
+ 0.01570854122336294 1:-0.781818 2:-0.75 3:-0.5 4:-0.966527 5:-0.8768
89
+ 0.0836137287825839 1:-0.681159 2:-0.833333 3:-0.666667 4:-0.95537 5:-0.3328
90
+ 0.125 1:-0.681159 2:-0.833333 3:-0.666667 4:-0.95537 5:0.2112
91
+ 0.03714202104050296 1:-0.769231 2:-1 3:-1 4:-0.821478 5:0.2656
92
+ 0.125 1:-0.681159 2:-0.833333 3:-0.666667 4:-0.95537 5:0.32
93
+ 0.01857206694116549 1:0.322835 2:-1 3:-0.5 4:-0.944212 5:-0.36
94
+ 0.03075145032456461 1:-0.129496 2:-0.853659 3:0.0731707 4:-0.941218 5:-0.088
95
+ 0.00632380603735933 1:-0.695652 2:-1 3:-1 4:-0.948994 5:-0.8
96
+ 0.003936231589231347 1:-0.575758 2:-0.6 3:-1 4:-0.732218 5:-0.792727
97
+ 0.01447392509447366 1:0.111111 2:-1 3:-1 4:-0.785774 5:-0.53
98
+ 0.06198936228686536 1:-0.6 2:-1 3:-1 4:-1 5:0.0933333
99
+ 0.01295834392175865 1:0.142857 2:-1 3:-1 4:-1 5:-0.7
100
+ -0.125 1:-0.51054 2:-0.574586 3:-0.78453 4:-0.897177 5:0.32
101
+ -0.125 1:-0.474211 2:-0.787234 3:-0.730496 4:-0.84047 5:0.048
102
+ -0.125 1:-0.716069 2:-0.592233 3:-0.883495 4:-0.953203 5:0.229333
103
+ -0.125 1:-0.843511 2:-0.642857 3:-0.904762 4:-0.968121 5:0.32
104
+ -0.125 1:-0.483092 2:-0.75 3:-0.75 4:-0.832636 5:0.168889
105
+ -0.125 1:-0.361538 2:-0.74359 3:-0.74359 4:-0.876408 5:0.32
106
+ -0.125 1:-0.547792 2:-0.690789 3:-0.828947 4:-0.880203 5:0.32
107
+ -0.02063505424396911 1:-0.505391 2:-0.757848 3:-0.820628 4:-0.865508 5:0.32
108
+ -0.125 1:-0.48954 2:-0.670251 3:-0.784946 4:-0.890584 5:0.184
109
+ -0.125 1:-0.393393 2:-0.821429 3:-0.785714 4:-0.890018 5:0.32
110
+ -0.125 1:-0.552613 2:-0.604061 3:-0.796954 4:-0.895334 5:0.32
111
+ -0.125 1:-0.56341 2:-0.704762 3:-0.809524 4:-0.848257 5:0.32
112
+ -0.125 1:-0.543767 2:-0.743119 3:-0.832241 4:-0.880674 5:0.32
113
+ -0.125 1:-0.494214 2:-0.720183 3:-0.724771 4:-0.851983 5:0.16
114
+ -0.125 1:-0.566851 2:-0.662269 3:-0.82058 4:-0.842439 5:0.32
115
+ -0.125 1:-0.521058 2:-0.801829 3:-0.823171 4:-0.853454 5:0.32
116
+ -0.125 1:-0.464121 2:-0.779661 3:-0.737288 4:-0.838877 5:0.424615
117
+ -0.125 1:-0.476154 2:-0.825137 3:-0.759563 4:-0.847818 5:0.456
118
+ -0.125 1:-0.515772 2:-0.833333 3:-0.583333 4:-0.901441 5:0.32
119
+ -0.125 1:-0.527421 2:-0.730413 3:-0.814659 4:-0.894421 5:0.125714
120
+ -0.125 1:-0.524205 2:-0.818616 3:-0.809069 4:-0.880489 5:0.32
121
+ -0.125 1:-0.548023 2:-0.787234 3:-0.829787 4:-0.897445 5:0.125714
122
+ -0.125 1:-0.419355 2:-0.839286 3:-0.5 4:-0.856545 5:0.32
123
+ -0.125 1:-0.515738 2:-0.72093 3:-0.661734 4:-0.899228 5:0.0285714
124
+ -0.125 1:-0.537209 2:-0.811024 3:-0.811024 4:-0.877706 5:0.32
125
+ -0.125 1:-0.512427 2:-0.820175 3:-0.561404 4:-0.90839 5:0.125714
126
+ -0.125 1:-0.566514 2:-0.734375 3:-0.78125 4:-0.851464 5:0.32
127
+ -0.125 1:-0.581746 2:-0.670455 3:-0.818182 4:-0.893496 5:0.32
128
+ -0.125 1:-0.501896 2:-0.760825 3:-0.802062 4:-0.864176 5:0.529231
129
+ -0.125 1:-0.658537 2:-0.481481 3:-1 4:-0.821478 5:-0.845714
130
+ -0.125 1:-0.728738 2:-0.552426 3:-0.862285 4:-0.92373 5:0.32
131
+ -0.125 1:-0.554046 2:-0.572118 3:-0.789812 4:-0.859719 5:0.32
132
+ -0.125 1:-0.515097 2:-0.566098 3:-0.855011 4:-0.830994 5:0.32
133
+ -0.125 1:-0.603835 2:-0.577181 3:-0.704698 4:-0.875993 5:-0.174545
134
+ -0.125 1:-0.57637 2:-0.550278 3:-0.777665 4:-0.875919 5:0.32
135
+ -0.125 1:-0.496503 2:-0.826087 3:-0.826087 4:-0.930144 5:0.32
136
+ -0.125 1:-0.469146 2:-0.714912 3:-0.640351 4:-0.897233 5:0.222857
data/lib/#language.rb# ADDED
@@ -0,0 +1,66 @@
1
+ require_relative "names"
2
+
3
+ module PdfExtract::Language
4
+
5
+ def self.transliterate s
6
+ s = s.gsub "\ufb01", "fi"
7
+ s = s.gsub "\ufb02", "fl"
8
+ s = s.gsub "\ufb03", "ffi"
9
+ s = s.gsub "\ufb04", "ffl"
10
+ s = s.gsub "\ufb06", "st"
11
+ s = s.gsub "\u2018", "'"
12
+ s = s.gsub "\u2019", "'"
13
+ s = s.gsub "\u2013", "-"
14
+ s = s.gsub "\u2014", "-"
15
+ s = s.gsub "\u201c", "\""
16
+ s = s.gsub "\u201d", "\""
17
+ s = s.gsub "\u25af", "("
18
+ s = s.gsub "\u00b4", ""
19
+ s = s.gsub "\u00b1", "-"
20
+
21
+ s = s.gsub /\s+/, " "
22
+ end
23
+
24
+ def self.letter_ratio s
25
+ s.count("A-Z0-9\-[],.\"'()") / s.length.to_f
26
+ end
27
+
28
+ # TODO Ignore caps in middle of words
29
+ def self.cap_ratio s
30
+ sentence_end = true
31
+ cap_count = 0
32
+
33
+ s.each_char do |c|
34
+ if c =~ /\./
35
+ sentence_end = true
36
+ elsif c =~ /[A-Z]/
37
+ cap_count = cap_count + 1 unless sentence_end
38
+ sentence_end = false
39
+ elsif c =~ /[^\s]/
40
+ sentence_end = false
41
+ end
42
+ end
43
+
44
+ cap_count / s.split.length.to_f
45
+ end
46
+
47
+ def self.year_ratio s
48
+ words = s.split
49
+
50
+ year_words = words.map do |word|
51
+ word =~ /[^\d]\d{4}[^\d]/
52
+ end
53
+
54
+ year_words.reject { |year_word| not year_word }.length / words.length.to_f
55
+ end
56
+
57
+ def self.name_ratio content
58
+ PdfExtract::Names.detect_names(content)[:name_frequency]
59
+ end
60
+
61
+ def self.word_count s
62
+ s.split.count
63
+ end
64
+
65
+ end
66
+
@@ -138,21 +138,15 @@ module PdfExtract
138
138
  sections = add_content_stats sections, pages.keys.count
139
139
 
140
140
  # Score sections into categories based on their textual attributes.
141
- ideals = {
142
- :reference => {
143
- :name_ratio => [0.2, 2],
144
- :letter_ratio => [0.25, 2],
145
- :year_ratio => [0.05, 7],
146
- :cap_ratio => [0.5, 5]
147
- },
148
- :body => {
149
- :name_ratio => [0.03, 1],
150
- :letter_ratio => [0.1, 1],
151
- :year_ratio => [0.0, 1]
152
- }
141
+ ref_ideals = {
142
+ :name_ratio => [0.14, 1],
143
+ :letter_ratio => [0.23, 6],
144
+ :year_ratio => [0.05, 10],
145
+ :cap_ratio => [0.49, 10],
146
+ :lateness => [0.96, 6]
153
147
  }
154
148
 
155
- Spatial.score(sections, ideals)
149
+ Spatial.score(sections, ref_ideals, :reference_score)
156
150
 
157
151
  sections
158
152
  end
data/lib/language.rb CHANGED
@@ -5,12 +5,21 @@ module PdfExtract::Language
5
5
  def self.transliterate s
6
6
  s = s.gsub "\ufb01", "fi"
7
7
  s = s.gsub "\ufb02", "fl"
8
+ s = s.gsub "\ufb03", "ffi"
9
+ s = s.gsub "\ufb04", "ffl"
10
+ s = s.gsub "\ufb06", "st"
8
11
  s = s.gsub "\u2018", "'"
9
12
  s = s.gsub "\u2019", "'"
10
13
  s = s.gsub "\u2013", "-"
14
+ s = s.gsub "\u2014", "-"
11
15
  s = s.gsub "\u201c", "\""
12
16
  s = s.gsub "\u201d", "\""
13
- s
17
+ s = s.gsub "\u25af", "("
18
+ s = s.gsub "\u00b4", ""
19
+ s = s.gsub "\u00b1", "-"
20
+
21
+
22
+ s = s.gsub /\s+/, " "
14
23
  end
15
24
 
16
25
  def self.letter_ratio s
@@ -40,7 +49,7 @@ module PdfExtract::Language
40
49
  words = s.split
41
50
 
42
51
  year_words = words.map do |word|
43
- word =~ /\.*\d{4}\.*/
52
+ word =~ /[^\d]\d{4}[^\d]/
44
53
  end
45
54
 
46
55
  year_words.reject { |year_word| not year_word }.length / words.length.to_f
@@ -49,8 +49,17 @@ module PdfExtract
49
49
  end
50
50
  end
51
51
 
52
+ def self.find_media_box page, objects
53
+ if page[:MediaBox]
54
+ page[:MediaBox]
55
+ elsif page[:Parent]
56
+ find_media_box objects[page[:Parent]], objects
57
+ else
58
+ [0, 0, 0, 0]
59
+ end
60
+ end
61
+
52
62
  def self.make_text_runs text, tj, state, render_state, page, page_number
53
- # TODO Ignore chars outside the page :MediaBox.
54
63
  # TODO Multiply by UserUnit if specified by page.
55
64
  # TODO Include writing mode, so that runs can be joined either
56
65
  # vertically or horizontally in the join stage.
@@ -87,6 +96,8 @@ module PdfExtract
87
96
 
88
97
  px = bl_pos.row(0)[0]
89
98
  py = bl_pos.row(0)[1]
99
+
100
+ media_box = find_media_box(page.page_object, page.objects)
90
101
 
91
102
  objs << {
92
103
  :x => px,
@@ -97,8 +108,8 @@ module PdfExtract
97
108
  :content => state.last[:font].to_utf8(c),
98
109
  :page => page_number,
99
110
  :font => state.last[:font].basefont,
100
- :page_width => page[:MediaBox][2] - page[:MediaBox][0],
101
- :page_height => page[:MediaBox][3] - page[:MediaBox][1]
111
+ :page_width => media_box[2] - media_box[0],
112
+ :page_height => media_box[3] - media_box[1]
102
113
  }
103
114
 
104
115
  disp_x, disp_y = glyph_displacement(c, state)
@@ -114,6 +125,17 @@ module PdfExtract
114
125
  objs
115
126
  end
116
127
 
128
+ def self.build_fonts page
129
+ fonts = {}
130
+ font_metrics = {}
131
+ page.fonts.each do |label, ref|
132
+ font = PDF::Reader::Font.new(page.objects, page.objects[ref])
133
+ fonts[label] = font
134
+ font_metrics[label] = FontMetrics.new font
135
+ end
136
+ [fonts, font_metrics]
137
+ end
138
+
117
139
  def self.include_in pdf
118
140
 
119
141
  pdf.spatials :characters do |parser|
@@ -127,15 +149,19 @@ module PdfExtract
127
149
  :tlm => Matrix.identity(3)
128
150
  }
129
151
 
130
- parser.for :resource_font do |data|
131
- fonts[data[0]] = data[1]
132
- font_metrics[data[0]] = FontMetrics.new data[1]
133
- nil
134
- end
152
+ # parser.for :resource_font do |data|
153
+ # puts data
154
+ # fonts[data[0]] = data[1]
155
+ # font_metrics[data[0]] = FontMetrics.new data[1]
156
+ # nil
157
+ # end
135
158
 
136
159
  parser.for :begin_page do |data|
137
160
  page = data[0]
138
161
  page_n = page_n.next
162
+
163
+ fonts, font_metrics = build_fonts page
164
+
139
165
  state << {
140
166
  :h_scale => 100,
141
167
  :char_spacing => 0,
@@ -270,7 +296,7 @@ module PdfExtract
270
296
 
271
297
  # Show text operators.
272
298
 
273
- parser.for :set_spacing_next_line_show_text_raw do |data|
299
+ parser.for :set_spacing_next_line_show_text do |data|
274
300
  state.last[:word_spacing] = data[0]
275
301
  state.last[:char_spacing] = data[1]
276
302
 
@@ -282,7 +308,7 @@ module PdfExtract
282
308
  make_text_runs data[2], 0, state, render_state, page, page_n
283
309
  end
284
310
 
285
- parser.for :move_to_next_line_and_show_text_raw do |data|
311
+ parser.for :move_to_next_line_and_show_text do |data|
286
312
  render_state[:tm] = Matrix[
287
313
  [1, 0, 0], [0, 1, 0], [0, -state.last[:leading], 1]
288
314
  ] * render_state[:tlm]
@@ -291,11 +317,11 @@ module PdfExtract
291
317
  make_text_runs data.first, 0, state, render_state, page, page_n
292
318
  end
293
319
 
294
- parser.for :show_text_raw do |data|
320
+ parser.for :show_text do |data|
295
321
  make_text_runs data.first, 0, state, render_state, page, page_n
296
322
  end
297
323
 
298
- parser.for :show_text_with_positioning_raw do |data|
324
+ parser.for :show_text_with_positioning do |data|
299
325
  data = data.first
300
326
  runs = []
301
327
  tj = 0
data/lib/pdf.rb CHANGED
@@ -129,9 +129,9 @@ module PdfExtract
129
129
  end
130
130
 
131
131
  paged_objs.each_pair do |page, objs|
132
- self.call_before
132
+ call_before
133
133
 
134
- if self.object_calls?
134
+ if object_calls?
135
135
  @object_listeners.each_pair do |type, listeners|
136
136
  listeners.each do |listener|
137
137
  if objs[type].nil?
@@ -142,22 +142,29 @@ module PdfExtract
142
142
  end
143
143
  end
144
144
 
145
- self.call_after
145
+ call_after
146
146
  end
147
147
 
148
148
  else
149
149
 
150
- self.call_before
151
- if self.object_calls?
152
- self.call_object_listeners @pdf.spatial_objects
150
+ call_before
151
+ if object_calls?
152
+ call_object_listeners @pdf.spatial_objects
153
153
  end
154
- self.call_after
154
+ call_after
155
155
 
156
156
  end
157
157
 
158
- if self.for_calls?
159
- self.expand_listeners_to_callback_methods
160
- PDF::Reader.file filename, self, :raw_text => true
158
+ if for_calls?
159
+ expand_listeners_to_callback_methods
160
+ #PDF::Reader.file filename, self, :raw_text => true
161
+
162
+ reader = PDF::Reader.new filename, :raw_text => true
163
+ reader.pages.each do |page|
164
+ begin_page page
165
+ page.walk self
166
+ end_page page
167
+ end
161
168
  end
162
169
  end
163
170
 
@@ -1,11 +1,14 @@
1
+ # -*- coding: utf-8 -*-
1
2
  require_relative "../spatial"
3
+ require_relative "score"
2
4
 
3
5
  module PdfExtract
4
6
  module References
5
-
6
- Settings.default :min_score, 8.0
7
+
8
+ Settings.default :reference_flex, 0.1
7
9
  Settings.default :min_sequence_count, 3
8
10
  Settings.default :max_reference_order, 1000
11
+ Settings.default :min_lateness , 0.5
9
12
 
10
13
  def self.partition_by ary, &block
11
14
  matching = []
@@ -159,23 +162,44 @@ module PdfExtract
159
162
  def self.include_in pdf
160
163
  pdf.spatials :references, :depends_on => [:sections] do |parser|
161
164
 
162
- refs = []
165
+ sections = []
163
166
 
164
167
  parser.objects :sections do |section|
165
- # TODO Take top x%, fix Infinity coming back from score.
166
- if section[:reference_score] >= pdf.settings[:min_score]
167
- if numeric_sequence? pdf, Spatial.get_text_content(section)
168
- refs += split_by_delimiter pdf, Spatial.get_text_content(section)
169
- elsif multi_margin? section[:lines]
170
- refs += split_by_margin section[:lines]
171
- elsif multi_spacing? section[:lines]
172
- refs += split_by_line_spacing section[:lines]
173
- end
174
- end
168
+ sections << section
175
169
  end
176
170
 
177
171
  parser.after do
178
- refs
172
+ max_score = sections.map {|s| s[:reference_score]}.max
173
+ min_permittable = max_score - (max_score * pdf.settings[:reference_flex])
174
+
175
+ refs = []
176
+
177
+ sections = sections.reject do |s|
178
+ # A section without any years is definitely not a list of
179
+ # references. So too a section that appears in the first
180
+ # half of an article.
181
+ s[:lateness] < pdf.settings[:min_lateness] || s[:year_ratio].zero?
182
+ end
183
+
184
+ sections.each do |section|
185
+ if section[:reference_score] >= min_permittable
186
+ if numeric_sequence? pdf, Spatial.get_text_content(section)
187
+ refs += split_by_delimiter pdf, Spatial.get_text_content(section)
188
+ elsif multi_margin? section[:lines]
189
+ refs += split_by_margin section[:lines]
190
+ elsif multi_spacing? section[:lines]
191
+ refs += split_by_line_spacing section[:lines]
192
+ end
193
+ end
194
+ end
195
+
196
+ # TODO Ideally we wouldn't see the ref headers here.
197
+ # Unfortunately publication details can look a lot like references.
198
+ refs.reject do |ref|
199
+ norm = ref[:content].downcase.strip
200
+ norm =~ /references?/ || norm =~ /submitted for publication/ || norm =~ /additional contributions/
201
+ end
202
+
179
203
  end
180
204
 
181
205
  end
@@ -0,0 +1,28 @@
1
+ require "svm"
2
+
3
+ module PdfExtract
4
+ module Score
5
+
6
+ def self.path_to_data data_filename
7
+ File.join(File.dirname(File.expand_path(__FILE__)), "../../data/" + data_filename)
8
+ end
9
+
10
+ @@reference_model = Model.new(path_to_data("reference.model"))
11
+
12
+ def self.reference? section
13
+ sample = {
14
+ 1 => section[:letter_ratio],
15
+ 2 => section[:name_ratio],
16
+ 3 => section[:year_ratio],
17
+ 4 => section[:cap_ratio],
18
+ 5 => section[:lateness]
19
+ }
20
+
21
+ puts sample
22
+
23
+ puts @@reference_model.predict(sample)
24
+ @@reference_model.predict(sample) == 1
25
+ end
26
+
27
+ end
28
+ end
data/lib/spatial.rb CHANGED
@@ -153,42 +153,23 @@ module PdfExtract
153
153
  (b_top <= a_top && b_top >= a[from]) || (b[from] >= a[from] && b[from] <= b_top)
154
154
  end
155
155
 
156
- def self.score items, ideals
157
- types = {}
158
- ideals.keys.each do |name|
159
- types[name] = ideals[name].keys
160
- end
161
-
162
- types.each do |name, vars|
163
- score_name = (name.to_s + "_score").to_sym
156
+ def self.score items, ideals, name
157
+ ideals.keys.each do |f|
158
+ diffs = items.map {|item| (item[f] - ideals[f][0]).abs}
159
+ diffs.map! {|d| d.nan? ? 1 : d}
160
+ max_diff = diffs.max
164
161
 
165
- vars.each do |var_name|
166
-
167
- scores = []
168
- items.each do |item|
169
- diff = (item[var_name] - ideals[name][var_name][0]).abs
170
- if diff.zero?
171
- scores << 1.0
172
- else
173
- s = 1.0 / diff
174
- if not s.finite?
175
- scores << 0.0
176
- else
177
- scores << s
178
- end
179
- end
180
- end
181
-
182
- score_max = scores.max
183
- weighted_scores = scores.map do |score|
184
- (score / score_max) * ideals[name][var_name][1]
162
+ scores = diffs.map do |d|
163
+ if d == 0
164
+ ideals[f][1]
165
+ else
166
+ (1 - (d / max_diff)) * ideals[f][1]
185
167
  end
168
+ end
186
169
 
187
- items.each_index do |idx|
188
- items[idx][score_name] ||= 0.0
189
- items[idx][score_name] += weighted_scores[idx]
190
- end
191
-
170
+ items.each_index do |i|
171
+ items[i][name] ||= 0
172
+ items[i][name] = items[i][name] + scores[i]
192
173
  end
193
174
  end
194
175
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 6
9
- version: 0.0.6
8
+ - 7
9
+ version: 0.0.7
10
10
  platform: ruby
11
11
  authors:
12
12
  - Karl Jonathan Ward
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-11-02 00:00:00 +00:00
17
+ date: 2011-11-09 00:00:00 +00:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -23,14 +23,13 @@ dependencies:
23
23
  requirement: &id001 !ruby/object:Gem::Requirement
24
24
  none: false
25
25
  requirements:
26
- - - "="
26
+ - - ">="
27
27
  - !ruby/object:Gem::Version
28
28
  segments:
29
- - 1
30
29
  - 0
31
- - 0
32
- - beta1
33
- version: 1.0.0.beta1
30
+ - 10
31
+ - 1
32
+ version: 0.10.1
34
33
  type: :runtime
35
34
  version_requirements: *id001
36
35
  - !ruby/object:Gem::Dependency
@@ -146,6 +145,7 @@ files:
146
145
  - bin/some6.mask.pdf
147
146
  - bin/train.rb
148
147
  - bin/two-column.mask.pdf
148
+ - lib/#language.rb#
149
149
  - lib/analysis/columns.rb
150
150
  - lib/analysis/margins.rb
151
151
  - lib/analysis/sections.rb
@@ -164,12 +164,14 @@ files:
164
164
  - lib/references/references.rb
165
165
  - lib/references/resolve.rb
166
166
  - lib/references/resolved_references.rb
167
+ - lib/references/score.rb
167
168
  - lib/spatial.rb
168
169
  - lib/view/abstract_view.rb
169
170
  - lib/view/pdf_view.rb
170
171
  - lib/view/png_view.rb
171
172
  - lib/view/xml_view.rb
172
173
  - data/familynames.db
174
+ - data/reference.model
173
175
  - data/stopwords.txt
174
176
  has_rdoc: true
175
177
  homepage: http://github.com/CrossRef/pdfextract