pdf-extract 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/data/reference.model +136 -0
- data/lib/#language.rb# +66 -0
- data/lib/analysis/sections.rb +7 -13
- data/lib/language.rb +11 -2
- data/lib/model/characters.rb +38 -12
- data/lib/pdf.rb +17 -10
- data/lib/references/references.rb +38 -14
- data/lib/references/score.rb +28 -0
- data/lib/spatial.rb +14 -33
- metadata +10 -8
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
svm_type c_svc
|
|
2
|
+
kernel_type rbf
|
|
3
|
+
gamma 8
|
|
4
|
+
nr_class 2
|
|
5
|
+
total_sv 127
|
|
6
|
+
rho -0.961432
|
|
7
|
+
label 0 1
|
|
8
|
+
nr_sv 90 37
|
|
9
|
+
SV
|
|
10
|
+
0.01857436364550136 2:-1 3:-0.809524 4:-1 5:0.065
|
|
11
|
+
0.01937561173990551 1:-1 2:-1 3:-1 4:-1 5:-0.9312
|
|
12
|
+
0.0160910161902079 1:1 2:-1 3:-1 4:-1 5:0.15
|
|
13
|
+
0.01400900681690667 1:0.761194 2:-0.777778 3:-1 4:0.725709 5:-0.088
|
|
14
|
+
0.03032223844259175 1:-0.183673 2:-1 3:-1 4:-0.821478 5:-0.0426667
|
|
15
|
+
0.02126950063232856 1:-0.743902 2:-0.391304 3:-1 4:-0.813717 5:-0.768
|
|
16
|
+
0.02113478553748108 1:0.47619 2:-1 3:-1 4:-1 5:0.0933333
|
|
17
|
+
0.01823561274250169 1:0.737705 2:-0.777778 3:-1 4:0.547187 5:-0.224
|
|
18
|
+
0.09700250121827236 1:-0.809524 2:-0.52381 3:-1 4:-0.885236 5:0.32
|
|
19
|
+
0.01766030634427132 1:-0.788079 2:-0.727273 3:-1 4:-0.866109 5:0.258182
|
|
20
|
+
0.02997117793994787 1:-0.428571 2:-1 3:0.333333 4:-0.866109 5:-0.942857
|
|
21
|
+
0.03160597346787591 1:-0.533981 2:-0.363636 3:-1 4:-0.878281 5:-0.96
|
|
22
|
+
0.01883478571632254 1:1 2:-1 3:-1 4:-1 5:0.32
|
|
23
|
+
0.06000320619142035 1:-0.548023 2:-0.25 3:-0.666667 4:-0.788006 5:-0.236364
|
|
24
|
+
0.007004876856992319 1:-0.666667 2:-0.333333 3:-1 4:-1 5:-0.669091
|
|
25
|
+
0.03827464518754259 1:-1 2:1 3:-1 4:-1 5:-0.96
|
|
26
|
+
0.02726286733562099 1:1 2:-1 3:-1 4:-1 5:-0.96
|
|
27
|
+
0.125 1:-0.696203 2:-0.333333 3:-0.666667 4:-0.888424 5:0.229333
|
|
28
|
+
0.03344964665575857 1:0.800784 2:-1 3:-1 4:1 5:0.08
|
|
29
|
+
0.03128222505105369 1:0.630952 2:-1 3:-1 4:-0.439331 5:0.08
|
|
30
|
+
0.125 1:-0.575758 2:-0.666667 3:-1 4:-0.776848 5:-1.11022e-16
|
|
31
|
+
0.125 1:-0.738149 2:-0.770642 3:-0.963303 4:-0.891904 5:0.16
|
|
32
|
+
0.01250576564390459 1:-0.957447 2:-0.492958 3:-1 4:-0.996228 5:-0.9856
|
|
33
|
+
0.03217406791268479 1:-0.00854701 2:-1 3:-1 4:-0.330544 5:-1.11022e-16
|
|
34
|
+
0.02480526634234231 1:-0.971831 2:0.0909091 3:-1 4:-1 5:-0.133333
|
|
35
|
+
0.01096993112524694 1:-1 2:-0.333333 3:-1 4:-1 5:-0.496
|
|
36
|
+
0.012887019582354 1:-0.714286 3:-1 4:-1 5:-0.405333
|
|
37
|
+
0.003331701548896184 1:-0.965812 2:-0.647059 3:-1 4:-1 5:-0.9856
|
|
38
|
+
0.005803393610819437 1:-0.769231 2:-1 3:-1 4:-0.821478 5:-0.8224
|
|
39
|
+
0.006890344962442365 1:-0.818182 2:-0.793103 3:-0.586207 4:-0.981532 5:-0.96
|
|
40
|
+
0.125 1:-0.777778 2:-0.6 3:-0.84 4:-0.946444 5:0.125714
|
|
41
|
+
0.001142586695743055 1:-0.705263 2:-0.466667 3:-1 4:-0.946444 5:-0.87
|
|
42
|
+
0.01168005213986096 1:-0.605263 2:-0.333333 3:-1 4:-0.754533 5:-0.87
|
|
43
|
+
0.125 1:-0.410959 2:-0.777778 3:-0.333333 4:-0.821478 5:0.32
|
|
44
|
+
0.125 1:-0.327869 2:-0.969697 3:-0.69697 4:-0.906682 5:0.184
|
|
45
|
+
0.00677461796062185 1:-0.537037 2:-0.733333 3:-0.733333 4:-0.821478 5:-0.935385
|
|
46
|
+
0.001379215298278784 1:-0.748571 2:-0.652174 3:-1 4:-0.860287 5:-1
|
|
47
|
+
0.0219356287285079 1:0.378378 2:-1 3:-1 4:-1 5:-0.68
|
|
48
|
+
0.02157538481319524 1:-0.388889 2:-1 4:-0.888424 5:-1
|
|
49
|
+
0.03357748970377396 2:-0.684211 3:-0.368421 4:-0.802687 5:-1
|
|
50
|
+
0.003894873430265556 1:-0.958779 2:-0.537688 3:-1 4:-0.997309 5:-0.9856
|
|
51
|
+
0.02545507275489556 1:1 2:-1 3:-1 4:-1 5:-0.632
|
|
52
|
+
0.03899336388799851 1:0.0447761 2:-1 3:1 4:-1 5:-0.088
|
|
53
|
+
0.03208662393251911 1:0.866667 2:-1 3:-1 4:0.472803 5:-0.516923
|
|
54
|
+
0.004986448465924906 1:-1 2:-0.666667 3:-1 4:-1 5:-0.632
|
|
55
|
+
0.009054249612984358 1:-0.707483 2:-0.487179 3:-1 4:-0.766549 5:-0.975238
|
|
56
|
+
0.125 1:-0.130435 2:-1 3:-1 4:-0.933054 5:0.2656
|
|
57
|
+
0.00932561307323757 1:-1 2:-1 3:-1 4:-1 5:-0.7
|
|
58
|
+
0.125 1:-0.167382 2:-0.755102 3:-0.510204 4:-0.912561 5:0.048
|
|
59
|
+
0.01366866452122291 1:-0.773913 2:-0.428571 3:-0.714286 4:-1 5:-0.942857
|
|
60
|
+
0.125 1:-0.610811 2:-0.6 3:-0.84 4:-0.860753 5:-0.133333
|
|
61
|
+
0.03093352314049818 1:-0.603175 2:-0.571429 3:-1 4:-0.751345 5:-0.813333
|
|
62
|
+
0.003235639174723532 1:0.687075 2:-1 3:-1 4:-0.788006 5:0.32
|
|
63
|
+
0.125 1:-0.369574 2:-0.727969 3:-0.877395 4:-0.944084 5:-0.224
|
|
64
|
+
0.03304675897033391 1:-0.79562 2:0.263158 3:-1 4:-0.81678 5:-0.96
|
|
65
|
+
0.04097040264957937 1:-0.760075 2:-0.533679 3:-0.937824 4:-0.968088 5:0.184
|
|
66
|
+
0.125 1:-0.807692 2:-0.502762 3:-0.933702 4:-0.928986 5:0.32
|
|
67
|
+
0.03861958220096956 1:0.857143 3:-1 4:0.606695 5:-0.949333
|
|
68
|
+
0.0241583069877076 1:-0.259259 2:-1 3:-0.333333 4:-0.95537 5:-0.6592
|
|
69
|
+
0.0246536887936159 1:-0.579439 2:-0.612903 3:-1 4:-0.714941 5:-0.942857
|
|
70
|
+
0.125 1:-0.567568 2:-0.6 3:-1 4:-0.973222 5:0.215385
|
|
71
|
+
0.125 1:-0.661538 2:-0.714286 3:-1 4:-0.942618 5:0.215385
|
|
72
|
+
0.00561316195289121 1:-0.953488 2:-0.333333 3:-1 4:-1 5:-0.87
|
|
73
|
+
0.0113461854574554 1:0.777778 2:-1 3:-1 4:-0.732218 5:0.32
|
|
74
|
+
0.125 1:-0.59322 2:-0.733333 3:-1 4:-0.964296 5:0.215385
|
|
75
|
+
0.125 1:-0.59633 2:-0.703704 3:-1 4:-0.960329 5:0.215385
|
|
76
|
+
0.125 1:-0.529412 2:-1 3:-1 4:-0.732218 5:0.215385
|
|
77
|
+
0.125 1:-0.705882 2:-0.791667 3:-1 4:-0.760112 5:0.32
|
|
78
|
+
0.0291523348938202 1:-0.677852 2:-0.44 3:-1 4:-0.817908 5:-0.845714
|
|
79
|
+
0.02288122491162361 1:0.244681 2:-0.944444 3:-0.277778 4:-0.90702 5:-0.53
|
|
80
|
+
0.04322752767081517 1:-0.924961 2:-0.437372 3:-0.975359 4:-0.979655 5:1
|
|
81
|
+
0.125 1:-0.737693 2:-0.674757 3:-0.951456 4:-0.899257 5:0.16
|
|
82
|
+
0.1240880203368809 1:-0.676898 2:-0.781548 3:-1 4:-0.963084 5:0.138667
|
|
83
|
+
0.125 1:-0.63222 2:-0.70979 3:-1 4:-0.956696 5:0.229333
|
|
84
|
+
0.125 1:-0.79017 2:-0.55 3:-0.833333 4:-0.968387 5:0.12
|
|
85
|
+
0.02530601224001184 1:-0.868733 2:-0.587463 3:-0.935469 4:-0.96655 5:0.32
|
|
86
|
+
0.006671484160950634 1:-0.873016 2:-0.0588235 3:-1 4:-0.889737 5:-0.904
|
|
87
|
+
0.125 1:-0.610811 2:-0.6 3:-0.84 4:-0.860753 5:0.32
|
|
88
|
+
0.01570854122336294 1:-0.781818 2:-0.75 3:-0.5 4:-0.966527 5:-0.8768
|
|
89
|
+
0.0836137287825839 1:-0.681159 2:-0.833333 3:-0.666667 4:-0.95537 5:-0.3328
|
|
90
|
+
0.125 1:-0.681159 2:-0.833333 3:-0.666667 4:-0.95537 5:0.2112
|
|
91
|
+
0.03714202104050296 1:-0.769231 2:-1 3:-1 4:-0.821478 5:0.2656
|
|
92
|
+
0.125 1:-0.681159 2:-0.833333 3:-0.666667 4:-0.95537 5:0.32
|
|
93
|
+
0.01857206694116549 1:0.322835 2:-1 3:-0.5 4:-0.944212 5:-0.36
|
|
94
|
+
0.03075145032456461 1:-0.129496 2:-0.853659 3:0.0731707 4:-0.941218 5:-0.088
|
|
95
|
+
0.00632380603735933 1:-0.695652 2:-1 3:-1 4:-0.948994 5:-0.8
|
|
96
|
+
0.003936231589231347 1:-0.575758 2:-0.6 3:-1 4:-0.732218 5:-0.792727
|
|
97
|
+
0.01447392509447366 1:0.111111 2:-1 3:-1 4:-0.785774 5:-0.53
|
|
98
|
+
0.06198936228686536 1:-0.6 2:-1 3:-1 4:-1 5:0.0933333
|
|
99
|
+
0.01295834392175865 1:0.142857 2:-1 3:-1 4:-1 5:-0.7
|
|
100
|
+
-0.125 1:-0.51054 2:-0.574586 3:-0.78453 4:-0.897177 5:0.32
|
|
101
|
+
-0.125 1:-0.474211 2:-0.787234 3:-0.730496 4:-0.84047 5:0.048
|
|
102
|
+
-0.125 1:-0.716069 2:-0.592233 3:-0.883495 4:-0.953203 5:0.229333
|
|
103
|
+
-0.125 1:-0.843511 2:-0.642857 3:-0.904762 4:-0.968121 5:0.32
|
|
104
|
+
-0.125 1:-0.483092 2:-0.75 3:-0.75 4:-0.832636 5:0.168889
|
|
105
|
+
-0.125 1:-0.361538 2:-0.74359 3:-0.74359 4:-0.876408 5:0.32
|
|
106
|
+
-0.125 1:-0.547792 2:-0.690789 3:-0.828947 4:-0.880203 5:0.32
|
|
107
|
+
-0.02063505424396911 1:-0.505391 2:-0.757848 3:-0.820628 4:-0.865508 5:0.32
|
|
108
|
+
-0.125 1:-0.48954 2:-0.670251 3:-0.784946 4:-0.890584 5:0.184
|
|
109
|
+
-0.125 1:-0.393393 2:-0.821429 3:-0.785714 4:-0.890018 5:0.32
|
|
110
|
+
-0.125 1:-0.552613 2:-0.604061 3:-0.796954 4:-0.895334 5:0.32
|
|
111
|
+
-0.125 1:-0.56341 2:-0.704762 3:-0.809524 4:-0.848257 5:0.32
|
|
112
|
+
-0.125 1:-0.543767 2:-0.743119 3:-0.832241 4:-0.880674 5:0.32
|
|
113
|
+
-0.125 1:-0.494214 2:-0.720183 3:-0.724771 4:-0.851983 5:0.16
|
|
114
|
+
-0.125 1:-0.566851 2:-0.662269 3:-0.82058 4:-0.842439 5:0.32
|
|
115
|
+
-0.125 1:-0.521058 2:-0.801829 3:-0.823171 4:-0.853454 5:0.32
|
|
116
|
+
-0.125 1:-0.464121 2:-0.779661 3:-0.737288 4:-0.838877 5:0.424615
|
|
117
|
+
-0.125 1:-0.476154 2:-0.825137 3:-0.759563 4:-0.847818 5:0.456
|
|
118
|
+
-0.125 1:-0.515772 2:-0.833333 3:-0.583333 4:-0.901441 5:0.32
|
|
119
|
+
-0.125 1:-0.527421 2:-0.730413 3:-0.814659 4:-0.894421 5:0.125714
|
|
120
|
+
-0.125 1:-0.524205 2:-0.818616 3:-0.809069 4:-0.880489 5:0.32
|
|
121
|
+
-0.125 1:-0.548023 2:-0.787234 3:-0.829787 4:-0.897445 5:0.125714
|
|
122
|
+
-0.125 1:-0.419355 2:-0.839286 3:-0.5 4:-0.856545 5:0.32
|
|
123
|
+
-0.125 1:-0.515738 2:-0.72093 3:-0.661734 4:-0.899228 5:0.0285714
|
|
124
|
+
-0.125 1:-0.537209 2:-0.811024 3:-0.811024 4:-0.877706 5:0.32
|
|
125
|
+
-0.125 1:-0.512427 2:-0.820175 3:-0.561404 4:-0.90839 5:0.125714
|
|
126
|
+
-0.125 1:-0.566514 2:-0.734375 3:-0.78125 4:-0.851464 5:0.32
|
|
127
|
+
-0.125 1:-0.581746 2:-0.670455 3:-0.818182 4:-0.893496 5:0.32
|
|
128
|
+
-0.125 1:-0.501896 2:-0.760825 3:-0.802062 4:-0.864176 5:0.529231
|
|
129
|
+
-0.125 1:-0.658537 2:-0.481481 3:-1 4:-0.821478 5:-0.845714
|
|
130
|
+
-0.125 1:-0.728738 2:-0.552426 3:-0.862285 4:-0.92373 5:0.32
|
|
131
|
+
-0.125 1:-0.554046 2:-0.572118 3:-0.789812 4:-0.859719 5:0.32
|
|
132
|
+
-0.125 1:-0.515097 2:-0.566098 3:-0.855011 4:-0.830994 5:0.32
|
|
133
|
+
-0.125 1:-0.603835 2:-0.577181 3:-0.704698 4:-0.875993 5:-0.174545
|
|
134
|
+
-0.125 1:-0.57637 2:-0.550278 3:-0.777665 4:-0.875919 5:0.32
|
|
135
|
+
-0.125 1:-0.496503 2:-0.826087 3:-0.826087 4:-0.930144 5:0.32
|
|
136
|
+
-0.125 1:-0.469146 2:-0.714912 3:-0.640351 4:-0.897233 5:0.222857
|
data/lib/#language.rb#
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
require_relative "names"
|
|
2
|
+
|
|
3
|
+
module PdfExtract::Language
|
|
4
|
+
|
|
5
|
+
def self.transliterate s
|
|
6
|
+
s = s.gsub "\ufb01", "fi"
|
|
7
|
+
s = s.gsub "\ufb02", "fl"
|
|
8
|
+
s = s.gsub "\ufb03", "ffi"
|
|
9
|
+
s = s.gsub "\ufb04", "ffl"
|
|
10
|
+
s = s.gsub "\ufb06", "st"
|
|
11
|
+
s = s.gsub "\u2018", "'"
|
|
12
|
+
s = s.gsub "\u2019", "'"
|
|
13
|
+
s = s.gsub "\u2013", "-"
|
|
14
|
+
s = s.gsub "\u2014", "-"
|
|
15
|
+
s = s.gsub "\u201c", "\""
|
|
16
|
+
s = s.gsub "\u201d", "\""
|
|
17
|
+
s = s.gsub "\u25af", "("
|
|
18
|
+
s = s.gsub "\u00b4", ""
|
|
19
|
+
s = s.gsub "\u00b1", "-"
|
|
20
|
+
|
|
21
|
+
s = s.gsub /\s+/, " "
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def self.letter_ratio s
|
|
25
|
+
s.count("A-Z0-9\-[],.\"'()") / s.length.to_f
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
# TODO Ignore caps in middle of words
|
|
29
|
+
def self.cap_ratio s
|
|
30
|
+
sentence_end = true
|
|
31
|
+
cap_count = 0
|
|
32
|
+
|
|
33
|
+
s.each_char do |c|
|
|
34
|
+
if c =~ /\./
|
|
35
|
+
sentence_end = true
|
|
36
|
+
elsif c =~ /[A-Z]/
|
|
37
|
+
cap_count = cap_count + 1 unless sentence_end
|
|
38
|
+
sentence_end = false
|
|
39
|
+
elsif c =~ /[^\s]/
|
|
40
|
+
sentence_end = false
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
cap_count / s.split.length.to_f
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def self.year_ratio s
|
|
48
|
+
words = s.split
|
|
49
|
+
|
|
50
|
+
year_words = words.map do |word|
|
|
51
|
+
word =~ /[^\d]\d{4}[^\d]/
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
year_words.reject { |year_word| not year_word }.length / words.length.to_f
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def self.name_ratio content
|
|
58
|
+
PdfExtract::Names.detect_names(content)[:name_frequency]
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
def self.word_count s
|
|
62
|
+
s.split.count
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
end
|
|
66
|
+
|
data/lib/analysis/sections.rb
CHANGED
|
@@ -138,21 +138,15 @@ module PdfExtract
|
|
|
138
138
|
sections = add_content_stats sections, pages.keys.count
|
|
139
139
|
|
|
140
140
|
# Score sections into categories based on their textual attributes.
|
|
141
|
-
|
|
142
|
-
:
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
},
|
|
148
|
-
:body => {
|
|
149
|
-
:name_ratio => [0.03, 1],
|
|
150
|
-
:letter_ratio => [0.1, 1],
|
|
151
|
-
:year_ratio => [0.0, 1]
|
|
152
|
-
}
|
|
141
|
+
ref_ideals = {
|
|
142
|
+
:name_ratio => [0.14, 1],
|
|
143
|
+
:letter_ratio => [0.23, 6],
|
|
144
|
+
:year_ratio => [0.05, 10],
|
|
145
|
+
:cap_ratio => [0.49, 10],
|
|
146
|
+
:lateness => [0.96, 6]
|
|
153
147
|
}
|
|
154
148
|
|
|
155
|
-
Spatial.score(sections,
|
|
149
|
+
Spatial.score(sections, ref_ideals, :reference_score)
|
|
156
150
|
|
|
157
151
|
sections
|
|
158
152
|
end
|
data/lib/language.rb
CHANGED
|
@@ -5,12 +5,21 @@ module PdfExtract::Language
|
|
|
5
5
|
def self.transliterate s
|
|
6
6
|
s = s.gsub "\ufb01", "fi"
|
|
7
7
|
s = s.gsub "\ufb02", "fl"
|
|
8
|
+
s = s.gsub "\ufb03", "ffi"
|
|
9
|
+
s = s.gsub "\ufb04", "ffl"
|
|
10
|
+
s = s.gsub "\ufb06", "st"
|
|
8
11
|
s = s.gsub "\u2018", "'"
|
|
9
12
|
s = s.gsub "\u2019", "'"
|
|
10
13
|
s = s.gsub "\u2013", "-"
|
|
14
|
+
s = s.gsub "\u2014", "-"
|
|
11
15
|
s = s.gsub "\u201c", "\""
|
|
12
16
|
s = s.gsub "\u201d", "\""
|
|
13
|
-
s
|
|
17
|
+
s = s.gsub "\u25af", "("
|
|
18
|
+
s = s.gsub "\u00b4", ""
|
|
19
|
+
s = s.gsub "\u00b1", "-"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
s = s.gsub /\s+/, " "
|
|
14
23
|
end
|
|
15
24
|
|
|
16
25
|
def self.letter_ratio s
|
|
@@ -40,7 +49,7 @@ module PdfExtract::Language
|
|
|
40
49
|
words = s.split
|
|
41
50
|
|
|
42
51
|
year_words = words.map do |word|
|
|
43
|
-
word =~
|
|
52
|
+
word =~ /[^\d]\d{4}[^\d]/
|
|
44
53
|
end
|
|
45
54
|
|
|
46
55
|
year_words.reject { |year_word| not year_word }.length / words.length.to_f
|
data/lib/model/characters.rb
CHANGED
|
@@ -49,8 +49,17 @@ module PdfExtract
|
|
|
49
49
|
end
|
|
50
50
|
end
|
|
51
51
|
|
|
52
|
+
def self.find_media_box page, objects
|
|
53
|
+
if page[:MediaBox]
|
|
54
|
+
page[:MediaBox]
|
|
55
|
+
elsif page[:Parent]
|
|
56
|
+
find_media_box objects[page[:Parent]], objects
|
|
57
|
+
else
|
|
58
|
+
[0, 0, 0, 0]
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
|
|
52
62
|
def self.make_text_runs text, tj, state, render_state, page, page_number
|
|
53
|
-
# TODO Ignore chars outside the page :MediaBox.
|
|
54
63
|
# TODO Mul UserUnit if specified by page.
|
|
55
64
|
# TODO Include writing mode, so that runs can be joined either
|
|
56
65
|
# vertically or horizontally in the join stage.
|
|
@@ -87,6 +96,8 @@ module PdfExtract
|
|
|
87
96
|
|
|
88
97
|
px = bl_pos.row(0)[0]
|
|
89
98
|
py = bl_pos.row(0)[1]
|
|
99
|
+
|
|
100
|
+
media_box = find_media_box(page.page_object, page.objects)
|
|
90
101
|
|
|
91
102
|
objs << {
|
|
92
103
|
:x => px,
|
|
@@ -97,8 +108,8 @@ module PdfExtract
|
|
|
97
108
|
:content => state.last[:font].to_utf8(c),
|
|
98
109
|
:page => page_number,
|
|
99
110
|
:font => state.last[:font].basefont,
|
|
100
|
-
:page_width =>
|
|
101
|
-
:page_height =>
|
|
111
|
+
:page_width => media_box[2] - media_box[0],
|
|
112
|
+
:page_height => media_box[3] - media_box[1]
|
|
102
113
|
}
|
|
103
114
|
|
|
104
115
|
disp_x, disp_y = glyph_displacement(c, state)
|
|
@@ -114,6 +125,17 @@ module PdfExtract
|
|
|
114
125
|
objs
|
|
115
126
|
end
|
|
116
127
|
|
|
128
|
+
def self.build_fonts page
|
|
129
|
+
fonts = {}
|
|
130
|
+
font_metrics = {}
|
|
131
|
+
page.fonts.each do |label, ref|
|
|
132
|
+
font = PDF::Reader::Font.new(page.objects, page.objects[ref])
|
|
133
|
+
fonts[label] = font
|
|
134
|
+
font_metrics[label] = FontMetrics.new font
|
|
135
|
+
end
|
|
136
|
+
[fonts, font_metrics]
|
|
137
|
+
end
|
|
138
|
+
|
|
117
139
|
def self.include_in pdf
|
|
118
140
|
|
|
119
141
|
pdf.spatials :characters do |parser|
|
|
@@ -127,15 +149,19 @@ module PdfExtract
|
|
|
127
149
|
:tlm => Matrix.identity(3)
|
|
128
150
|
}
|
|
129
151
|
|
|
130
|
-
parser.for :resource_font do |data|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
152
|
+
# parser.for :resource_font do |data|
|
|
153
|
+
# puts data
|
|
154
|
+
# fonts[data[0]] = data[1]
|
|
155
|
+
# font_metrics[data[0]] = FontMetrics.new data[1]
|
|
156
|
+
# nil
|
|
157
|
+
# end
|
|
135
158
|
|
|
136
159
|
parser.for :begin_page do |data|
|
|
137
160
|
page = data[0]
|
|
138
161
|
page_n = page_n.next
|
|
162
|
+
|
|
163
|
+
fonts, font_metrics = build_fonts page
|
|
164
|
+
|
|
139
165
|
state << {
|
|
140
166
|
:h_scale => 100,
|
|
141
167
|
:char_spacing => 0,
|
|
@@ -270,7 +296,7 @@ module PdfExtract
|
|
|
270
296
|
|
|
271
297
|
# Show text operators.
|
|
272
298
|
|
|
273
|
-
parser.for :
|
|
299
|
+
parser.for :set_spacing_next_line_show_text do |data|
|
|
274
300
|
state.last[:word_spacing] = data[0]
|
|
275
301
|
state.last[:char_spacing] = data[1]
|
|
276
302
|
|
|
@@ -282,7 +308,7 @@ module PdfExtract
|
|
|
282
308
|
make_text_runs data[2], 0, state, render_state, page, page_n
|
|
283
309
|
end
|
|
284
310
|
|
|
285
|
-
parser.for :
|
|
311
|
+
parser.for :move_to_next_line_and_show_text do |data|
|
|
286
312
|
render_state[:tm] = Matrix[
|
|
287
313
|
[1, 0, 0], [0, 1, 0], [0, -state.last[:leading], 1]
|
|
288
314
|
] * render_state[:tlm]
|
|
@@ -291,11 +317,11 @@ module PdfExtract
|
|
|
291
317
|
make_text_runs data.first, 0, state, render_state, page, page_n
|
|
292
318
|
end
|
|
293
319
|
|
|
294
|
-
parser.for :
|
|
320
|
+
parser.for :show_text do |data|
|
|
295
321
|
make_text_runs data.first, 0, state, render_state, page, page_n
|
|
296
322
|
end
|
|
297
323
|
|
|
298
|
-
parser.for :
|
|
324
|
+
parser.for :show_text_with_positioning do |data|
|
|
299
325
|
data = data.first
|
|
300
326
|
runs = []
|
|
301
327
|
tj = 0
|
data/lib/pdf.rb
CHANGED
|
@@ -129,9 +129,9 @@ module PdfExtract
|
|
|
129
129
|
end
|
|
130
130
|
|
|
131
131
|
paged_objs.each_pair do |page, objs|
|
|
132
|
-
|
|
132
|
+
call_before
|
|
133
133
|
|
|
134
|
-
if
|
|
134
|
+
if object_calls?
|
|
135
135
|
@object_listeners.each_pair do |type, listeners|
|
|
136
136
|
listeners.each do |listener|
|
|
137
137
|
if objs[type].nil?
|
|
@@ -142,22 +142,29 @@ module PdfExtract
|
|
|
142
142
|
end
|
|
143
143
|
end
|
|
144
144
|
|
|
145
|
-
|
|
145
|
+
call_after
|
|
146
146
|
end
|
|
147
147
|
|
|
148
148
|
else
|
|
149
149
|
|
|
150
|
-
|
|
151
|
-
if
|
|
152
|
-
|
|
150
|
+
call_before
|
|
151
|
+
if object_calls?
|
|
152
|
+
call_object_listeners @pdf.spatial_objects
|
|
153
153
|
end
|
|
154
|
-
|
|
154
|
+
call_after
|
|
155
155
|
|
|
156
156
|
end
|
|
157
157
|
|
|
158
|
-
if
|
|
159
|
-
|
|
160
|
-
PDF::Reader.file filename, self, :raw_text => true
|
|
158
|
+
if for_calls?
|
|
159
|
+
expand_listeners_to_callback_methods
|
|
160
|
+
#PDF::Reader.file filename, self, :raw_text => true
|
|
161
|
+
|
|
162
|
+
reader = PDF::Reader.new filename, :raw_text => true
|
|
163
|
+
reader.pages.each do |page|
|
|
164
|
+
begin_page page
|
|
165
|
+
page.walk self
|
|
166
|
+
end_page page
|
|
167
|
+
end
|
|
161
168
|
end
|
|
162
169
|
end
|
|
163
170
|
|
|
@@ -1,11 +1,14 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
1
2
|
require_relative "../spatial"
|
|
3
|
+
require_relative "score"
|
|
2
4
|
|
|
3
5
|
module PdfExtract
|
|
4
6
|
module References
|
|
5
|
-
|
|
6
|
-
Settings.default :
|
|
7
|
+
|
|
8
|
+
Settings.default :reference_flex, 0.1
|
|
7
9
|
Settings.default :min_sequence_count, 3
|
|
8
10
|
Settings.default :max_reference_order, 1000
|
|
11
|
+
Settings.default :min_lateness , 0.5
|
|
9
12
|
|
|
10
13
|
def self.partition_by ary, &block
|
|
11
14
|
matching = []
|
|
@@ -159,23 +162,44 @@ module PdfExtract
|
|
|
159
162
|
def self.include_in pdf
|
|
160
163
|
pdf.spatials :references, :depends_on => [:sections] do |parser|
|
|
161
164
|
|
|
162
|
-
|
|
165
|
+
sections = []
|
|
163
166
|
|
|
164
167
|
parser.objects :sections do |section|
|
|
165
|
-
|
|
166
|
-
if section[:reference_score] >= pdf.settings[:min_score]
|
|
167
|
-
if numeric_sequence? pdf, Spatial.get_text_content(section)
|
|
168
|
-
refs += split_by_delimiter pdf, Spatial.get_text_content(section)
|
|
169
|
-
elsif multi_margin? section[:lines]
|
|
170
|
-
refs += split_by_margin section[:lines]
|
|
171
|
-
elsif multi_spacing? section[:lines]
|
|
172
|
-
refs += split_by_line_spacing section[:lines]
|
|
173
|
-
end
|
|
174
|
-
end
|
|
168
|
+
sections << section
|
|
175
169
|
end
|
|
176
170
|
|
|
177
171
|
parser.after do
|
|
178
|
-
|
|
172
|
+
max_score = sections.map {|s| s[:reference_score]}.max
|
|
173
|
+
min_permittable = max_score - (max_score * pdf.settings[:reference_flex])
|
|
174
|
+
|
|
175
|
+
refs = []
|
|
176
|
+
|
|
177
|
+
sections = sections.reject do |s|
|
|
178
|
+
# A section without any years is definitely not a list of
|
|
179
|
+
# references. So too a section that appears in the first
|
|
180
|
+
# half of an article.
|
|
181
|
+
s[:lateness] < pdf.settings[:min_lateness] || s[:year_ratio].zero?
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
sections.each do |section|
|
|
185
|
+
if section[:reference_score] >= min_permittable
|
|
186
|
+
if numeric_sequence? pdf, Spatial.get_text_content(section)
|
|
187
|
+
refs += split_by_delimiter pdf, Spatial.get_text_content(section)
|
|
188
|
+
elsif multi_margin? section[:lines]
|
|
189
|
+
refs += split_by_margin section[:lines]
|
|
190
|
+
elsif multi_spacing? section[:lines]
|
|
191
|
+
refs += split_by_line_spacing section[:lines]
|
|
192
|
+
end
|
|
193
|
+
end
|
|
194
|
+
end
|
|
195
|
+
|
|
196
|
+
# TODO Ideally we wouldn't see the ref headers here.
|
|
197
|
+
# Unfortunately publication details can look a lot like references.
|
|
198
|
+
refs.reject do |ref|
|
|
199
|
+
norm = ref[:content].downcase.strip
|
|
200
|
+
norm =~ /references?/ || norm =~ /submitted for publication/ || norm =~ /additional contributions/
|
|
201
|
+
end
|
|
202
|
+
|
|
179
203
|
end
|
|
180
204
|
|
|
181
205
|
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
require "svm"
|
|
2
|
+
|
|
3
|
+
module PdfExtract
|
|
4
|
+
module Score
|
|
5
|
+
|
|
6
|
+
def self.path_to_data data_filename
|
|
7
|
+
File.join(File.dirname(File.expand_path(__FILE__)), "../../data/" + data_filename)
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
@@reference_model = Model.new(path_to_data("reference.model"))
|
|
11
|
+
|
|
12
|
+
def self.reference? section
|
|
13
|
+
sample = {
|
|
14
|
+
1 => section[:letter_ratio],
|
|
15
|
+
2 => section[:name_ratio],
|
|
16
|
+
3 => section[:year_ratio],
|
|
17
|
+
4 => section[:cap_ratio],
|
|
18
|
+
5 => section[:lateness]
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
puts sample
|
|
22
|
+
|
|
23
|
+
puts @@reference_model.predict(sample)
|
|
24
|
+
@@reference_model.predict(sample) == 1
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
end
|
|
28
|
+
end
|
data/lib/spatial.rb
CHANGED
|
@@ -153,42 +153,23 @@ module PdfExtract
|
|
|
153
153
|
(b_top <= a_top && b_top >= a[from]) || (b[from] >= a[from] && b[from] <= b_top)
|
|
154
154
|
end
|
|
155
155
|
|
|
156
|
-
def self.score items, ideals
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
types.each do |name, vars|
|
|
163
|
-
score_name = (name.to_s + "_score").to_sym
|
|
156
|
+
def self.score items, ideals, name
|
|
157
|
+
ideals.keys.each do |f|
|
|
158
|
+
diffs = items.map {|item| (item[f] - ideals[f][0]).abs}
|
|
159
|
+
diffs.map! {|d| d.nan? ? 1 : d}
|
|
160
|
+
max_diff = diffs.max
|
|
164
161
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
if diff.zero?
|
|
171
|
-
scores << 1.0
|
|
172
|
-
else
|
|
173
|
-
s = 1.0 / diff
|
|
174
|
-
if not s.finite?
|
|
175
|
-
scores << 0.0
|
|
176
|
-
else
|
|
177
|
-
scores << s
|
|
178
|
-
end
|
|
179
|
-
end
|
|
180
|
-
end
|
|
181
|
-
|
|
182
|
-
score_max = scores.max
|
|
183
|
-
weighted_scores = scores.map do |score|
|
|
184
|
-
(score / score_max) * ideals[name][var_name][1]
|
|
162
|
+
scores = diffs.map do |d|
|
|
163
|
+
if d == 0
|
|
164
|
+
ideals[f][1]
|
|
165
|
+
else
|
|
166
|
+
(1 - (d / max_diff)) * ideals[f][1]
|
|
185
167
|
end
|
|
168
|
+
end
|
|
186
169
|
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
end
|
|
191
|
-
|
|
170
|
+
items.each_index do |i|
|
|
171
|
+
items[i][name] ||= 0
|
|
172
|
+
items[i][name] = items[i][name] + scores[i]
|
|
192
173
|
end
|
|
193
174
|
end
|
|
194
175
|
end
|
metadata
CHANGED
|
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
|
5
5
|
segments:
|
|
6
6
|
- 0
|
|
7
7
|
- 0
|
|
8
|
-
-
|
|
9
|
-
version: 0.0.
|
|
8
|
+
- 7
|
|
9
|
+
version: 0.0.7
|
|
10
10
|
platform: ruby
|
|
11
11
|
authors:
|
|
12
12
|
- Karl Jonathan Ward
|
|
@@ -14,7 +14,7 @@ autorequire:
|
|
|
14
14
|
bindir: bin
|
|
15
15
|
cert_chain: []
|
|
16
16
|
|
|
17
|
-
date: 2011-11-
|
|
17
|
+
date: 2011-11-09 00:00:00 +00:00
|
|
18
18
|
default_executable:
|
|
19
19
|
dependencies:
|
|
20
20
|
- !ruby/object:Gem::Dependency
|
|
@@ -23,14 +23,13 @@ dependencies:
|
|
|
23
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
|
24
24
|
none: false
|
|
25
25
|
requirements:
|
|
26
|
-
- - "
|
|
26
|
+
- - ">="
|
|
27
27
|
- !ruby/object:Gem::Version
|
|
28
28
|
segments:
|
|
29
|
-
- 1
|
|
30
29
|
- 0
|
|
31
|
-
-
|
|
32
|
-
-
|
|
33
|
-
version:
|
|
30
|
+
- 10
|
|
31
|
+
- 1
|
|
32
|
+
version: 0.10.1
|
|
34
33
|
type: :runtime
|
|
35
34
|
version_requirements: *id001
|
|
36
35
|
- !ruby/object:Gem::Dependency
|
|
@@ -146,6 +145,7 @@ files:
|
|
|
146
145
|
- bin/some6.mask.pdf
|
|
147
146
|
- bin/train.rb
|
|
148
147
|
- bin/two-column.mask.pdf
|
|
148
|
+
- lib/#language.rb#
|
|
149
149
|
- lib/analysis/columns.rb
|
|
150
150
|
- lib/analysis/margins.rb
|
|
151
151
|
- lib/analysis/sections.rb
|
|
@@ -164,12 +164,14 @@ files:
|
|
|
164
164
|
- lib/references/references.rb
|
|
165
165
|
- lib/references/resolve.rb
|
|
166
166
|
- lib/references/resolved_references.rb
|
|
167
|
+
- lib/references/score.rb
|
|
167
168
|
- lib/spatial.rb
|
|
168
169
|
- lib/view/abstract_view.rb
|
|
169
170
|
- lib/view/pdf_view.rb
|
|
170
171
|
- lib/view/png_view.rb
|
|
171
172
|
- lib/view/xml_view.rb
|
|
172
173
|
- data/familynames.db
|
|
174
|
+
- data/reference.model
|
|
173
175
|
- data/stopwords.txt
|
|
174
176
|
has_rdoc: true
|
|
175
177
|
homepage: http://github.com/CrossRef/pdfextract
|