pdf-extract 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/data/reference.model +136 -0
- data/lib/#language.rb# +66 -0
- data/lib/analysis/sections.rb +7 -13
- data/lib/language.rb +11 -2
- data/lib/model/characters.rb +38 -12
- data/lib/pdf.rb +17 -10
- data/lib/references/references.rb +38 -14
- data/lib/references/score.rb +28 -0
- data/lib/spatial.rb +14 -33
- metadata +10 -8
@@ -0,0 +1,136 @@
|
|
1
|
+
svm_type c_svc
|
2
|
+
kernel_type rbf
|
3
|
+
gamma 8
|
4
|
+
nr_class 2
|
5
|
+
total_sv 127
|
6
|
+
rho -0.961432
|
7
|
+
label 0 1
|
8
|
+
nr_sv 90 37
|
9
|
+
SV
|
10
|
+
0.01857436364550136 2:-1 3:-0.809524 4:-1 5:0.065
|
11
|
+
0.01937561173990551 1:-1 2:-1 3:-1 4:-1 5:-0.9312
|
12
|
+
0.0160910161902079 1:1 2:-1 3:-1 4:-1 5:0.15
|
13
|
+
0.01400900681690667 1:0.761194 2:-0.777778 3:-1 4:0.725709 5:-0.088
|
14
|
+
0.03032223844259175 1:-0.183673 2:-1 3:-1 4:-0.821478 5:-0.0426667
|
15
|
+
0.02126950063232856 1:-0.743902 2:-0.391304 3:-1 4:-0.813717 5:-0.768
|
16
|
+
0.02113478553748108 1:0.47619 2:-1 3:-1 4:-1 5:0.0933333
|
17
|
+
0.01823561274250169 1:0.737705 2:-0.777778 3:-1 4:0.547187 5:-0.224
|
18
|
+
0.09700250121827236 1:-0.809524 2:-0.52381 3:-1 4:-0.885236 5:0.32
|
19
|
+
0.01766030634427132 1:-0.788079 2:-0.727273 3:-1 4:-0.866109 5:0.258182
|
20
|
+
0.02997117793994787 1:-0.428571 2:-1 3:0.333333 4:-0.866109 5:-0.942857
|
21
|
+
0.03160597346787591 1:-0.533981 2:-0.363636 3:-1 4:-0.878281 5:-0.96
|
22
|
+
0.01883478571632254 1:1 2:-1 3:-1 4:-1 5:0.32
|
23
|
+
0.06000320619142035 1:-0.548023 2:-0.25 3:-0.666667 4:-0.788006 5:-0.236364
|
24
|
+
0.007004876856992319 1:-0.666667 2:-0.333333 3:-1 4:-1 5:-0.669091
|
25
|
+
0.03827464518754259 1:-1 2:1 3:-1 4:-1 5:-0.96
|
26
|
+
0.02726286733562099 1:1 2:-1 3:-1 4:-1 5:-0.96
|
27
|
+
0.125 1:-0.696203 2:-0.333333 3:-0.666667 4:-0.888424 5:0.229333
|
28
|
+
0.03344964665575857 1:0.800784 2:-1 3:-1 4:1 5:0.08
|
29
|
+
0.03128222505105369 1:0.630952 2:-1 3:-1 4:-0.439331 5:0.08
|
30
|
+
0.125 1:-0.575758 2:-0.666667 3:-1 4:-0.776848 5:-1.11022e-16
|
31
|
+
0.125 1:-0.738149 2:-0.770642 3:-0.963303 4:-0.891904 5:0.16
|
32
|
+
0.01250576564390459 1:-0.957447 2:-0.492958 3:-1 4:-0.996228 5:-0.9856
|
33
|
+
0.03217406791268479 1:-0.00854701 2:-1 3:-1 4:-0.330544 5:-1.11022e-16
|
34
|
+
0.02480526634234231 1:-0.971831 2:0.0909091 3:-1 4:-1 5:-0.133333
|
35
|
+
0.01096993112524694 1:-1 2:-0.333333 3:-1 4:-1 5:-0.496
|
36
|
+
0.012887019582354 1:-0.714286 3:-1 4:-1 5:-0.405333
|
37
|
+
0.003331701548896184 1:-0.965812 2:-0.647059 3:-1 4:-1 5:-0.9856
|
38
|
+
0.005803393610819437 1:-0.769231 2:-1 3:-1 4:-0.821478 5:-0.8224
|
39
|
+
0.006890344962442365 1:-0.818182 2:-0.793103 3:-0.586207 4:-0.981532 5:-0.96
|
40
|
+
0.125 1:-0.777778 2:-0.6 3:-0.84 4:-0.946444 5:0.125714
|
41
|
+
0.001142586695743055 1:-0.705263 2:-0.466667 3:-1 4:-0.946444 5:-0.87
|
42
|
+
0.01168005213986096 1:-0.605263 2:-0.333333 3:-1 4:-0.754533 5:-0.87
|
43
|
+
0.125 1:-0.410959 2:-0.777778 3:-0.333333 4:-0.821478 5:0.32
|
44
|
+
0.125 1:-0.327869 2:-0.969697 3:-0.69697 4:-0.906682 5:0.184
|
45
|
+
0.00677461796062185 1:-0.537037 2:-0.733333 3:-0.733333 4:-0.821478 5:-0.935385
|
46
|
+
0.001379215298278784 1:-0.748571 2:-0.652174 3:-1 4:-0.860287 5:-1
|
47
|
+
0.0219356287285079 1:0.378378 2:-1 3:-1 4:-1 5:-0.68
|
48
|
+
0.02157538481319524 1:-0.388889 2:-1 4:-0.888424 5:-1
|
49
|
+
0.03357748970377396 2:-0.684211 3:-0.368421 4:-0.802687 5:-1
|
50
|
+
0.003894873430265556 1:-0.958779 2:-0.537688 3:-1 4:-0.997309 5:-0.9856
|
51
|
+
0.02545507275489556 1:1 2:-1 3:-1 4:-1 5:-0.632
|
52
|
+
0.03899336388799851 1:0.0447761 2:-1 3:1 4:-1 5:-0.088
|
53
|
+
0.03208662393251911 1:0.866667 2:-1 3:-1 4:0.472803 5:-0.516923
|
54
|
+
0.004986448465924906 1:-1 2:-0.666667 3:-1 4:-1 5:-0.632
|
55
|
+
0.009054249612984358 1:-0.707483 2:-0.487179 3:-1 4:-0.766549 5:-0.975238
|
56
|
+
0.125 1:-0.130435 2:-1 3:-1 4:-0.933054 5:0.2656
|
57
|
+
0.00932561307323757 1:-1 2:-1 3:-1 4:-1 5:-0.7
|
58
|
+
0.125 1:-0.167382 2:-0.755102 3:-0.510204 4:-0.912561 5:0.048
|
59
|
+
0.01366866452122291 1:-0.773913 2:-0.428571 3:-0.714286 4:-1 5:-0.942857
|
60
|
+
0.125 1:-0.610811 2:-0.6 3:-0.84 4:-0.860753 5:-0.133333
|
61
|
+
0.03093352314049818 1:-0.603175 2:-0.571429 3:-1 4:-0.751345 5:-0.813333
|
62
|
+
0.003235639174723532 1:0.687075 2:-1 3:-1 4:-0.788006 5:0.32
|
63
|
+
0.125 1:-0.369574 2:-0.727969 3:-0.877395 4:-0.944084 5:-0.224
|
64
|
+
0.03304675897033391 1:-0.79562 2:0.263158 3:-1 4:-0.81678 5:-0.96
|
65
|
+
0.04097040264957937 1:-0.760075 2:-0.533679 3:-0.937824 4:-0.968088 5:0.184
|
66
|
+
0.125 1:-0.807692 2:-0.502762 3:-0.933702 4:-0.928986 5:0.32
|
67
|
+
0.03861958220096956 1:0.857143 3:-1 4:0.606695 5:-0.949333
|
68
|
+
0.0241583069877076 1:-0.259259 2:-1 3:-0.333333 4:-0.95537 5:-0.6592
|
69
|
+
0.0246536887936159 1:-0.579439 2:-0.612903 3:-1 4:-0.714941 5:-0.942857
|
70
|
+
0.125 1:-0.567568 2:-0.6 3:-1 4:-0.973222 5:0.215385
|
71
|
+
0.125 1:-0.661538 2:-0.714286 3:-1 4:-0.942618 5:0.215385
|
72
|
+
0.00561316195289121 1:-0.953488 2:-0.333333 3:-1 4:-1 5:-0.87
|
73
|
+
0.0113461854574554 1:0.777778 2:-1 3:-1 4:-0.732218 5:0.32
|
74
|
+
0.125 1:-0.59322 2:-0.733333 3:-1 4:-0.964296 5:0.215385
|
75
|
+
0.125 1:-0.59633 2:-0.703704 3:-1 4:-0.960329 5:0.215385
|
76
|
+
0.125 1:-0.529412 2:-1 3:-1 4:-0.732218 5:0.215385
|
77
|
+
0.125 1:-0.705882 2:-0.791667 3:-1 4:-0.760112 5:0.32
|
78
|
+
0.0291523348938202 1:-0.677852 2:-0.44 3:-1 4:-0.817908 5:-0.845714
|
79
|
+
0.02288122491162361 1:0.244681 2:-0.944444 3:-0.277778 4:-0.90702 5:-0.53
|
80
|
+
0.04322752767081517 1:-0.924961 2:-0.437372 3:-0.975359 4:-0.979655 5:1
|
81
|
+
0.125 1:-0.737693 2:-0.674757 3:-0.951456 4:-0.899257 5:0.16
|
82
|
+
0.1240880203368809 1:-0.676898 2:-0.781548 3:-1 4:-0.963084 5:0.138667
|
83
|
+
0.125 1:-0.63222 2:-0.70979 3:-1 4:-0.956696 5:0.229333
|
84
|
+
0.125 1:-0.79017 2:-0.55 3:-0.833333 4:-0.968387 5:0.12
|
85
|
+
0.02530601224001184 1:-0.868733 2:-0.587463 3:-0.935469 4:-0.96655 5:0.32
|
86
|
+
0.006671484160950634 1:-0.873016 2:-0.0588235 3:-1 4:-0.889737 5:-0.904
|
87
|
+
0.125 1:-0.610811 2:-0.6 3:-0.84 4:-0.860753 5:0.32
|
88
|
+
0.01570854122336294 1:-0.781818 2:-0.75 3:-0.5 4:-0.966527 5:-0.8768
|
89
|
+
0.0836137287825839 1:-0.681159 2:-0.833333 3:-0.666667 4:-0.95537 5:-0.3328
|
90
|
+
0.125 1:-0.681159 2:-0.833333 3:-0.666667 4:-0.95537 5:0.2112
|
91
|
+
0.03714202104050296 1:-0.769231 2:-1 3:-1 4:-0.821478 5:0.2656
|
92
|
+
0.125 1:-0.681159 2:-0.833333 3:-0.666667 4:-0.95537 5:0.32
|
93
|
+
0.01857206694116549 1:0.322835 2:-1 3:-0.5 4:-0.944212 5:-0.36
|
94
|
+
0.03075145032456461 1:-0.129496 2:-0.853659 3:0.0731707 4:-0.941218 5:-0.088
|
95
|
+
0.00632380603735933 1:-0.695652 2:-1 3:-1 4:-0.948994 5:-0.8
|
96
|
+
0.003936231589231347 1:-0.575758 2:-0.6 3:-1 4:-0.732218 5:-0.792727
|
97
|
+
0.01447392509447366 1:0.111111 2:-1 3:-1 4:-0.785774 5:-0.53
|
98
|
+
0.06198936228686536 1:-0.6 2:-1 3:-1 4:-1 5:0.0933333
|
99
|
+
0.01295834392175865 1:0.142857 2:-1 3:-1 4:-1 5:-0.7
|
100
|
+
-0.125 1:-0.51054 2:-0.574586 3:-0.78453 4:-0.897177 5:0.32
|
101
|
+
-0.125 1:-0.474211 2:-0.787234 3:-0.730496 4:-0.84047 5:0.048
|
102
|
+
-0.125 1:-0.716069 2:-0.592233 3:-0.883495 4:-0.953203 5:0.229333
|
103
|
+
-0.125 1:-0.843511 2:-0.642857 3:-0.904762 4:-0.968121 5:0.32
|
104
|
+
-0.125 1:-0.483092 2:-0.75 3:-0.75 4:-0.832636 5:0.168889
|
105
|
+
-0.125 1:-0.361538 2:-0.74359 3:-0.74359 4:-0.876408 5:0.32
|
106
|
+
-0.125 1:-0.547792 2:-0.690789 3:-0.828947 4:-0.880203 5:0.32
|
107
|
+
-0.02063505424396911 1:-0.505391 2:-0.757848 3:-0.820628 4:-0.865508 5:0.32
|
108
|
+
-0.125 1:-0.48954 2:-0.670251 3:-0.784946 4:-0.890584 5:0.184
|
109
|
+
-0.125 1:-0.393393 2:-0.821429 3:-0.785714 4:-0.890018 5:0.32
|
110
|
+
-0.125 1:-0.552613 2:-0.604061 3:-0.796954 4:-0.895334 5:0.32
|
111
|
+
-0.125 1:-0.56341 2:-0.704762 3:-0.809524 4:-0.848257 5:0.32
|
112
|
+
-0.125 1:-0.543767 2:-0.743119 3:-0.832241 4:-0.880674 5:0.32
|
113
|
+
-0.125 1:-0.494214 2:-0.720183 3:-0.724771 4:-0.851983 5:0.16
|
114
|
+
-0.125 1:-0.566851 2:-0.662269 3:-0.82058 4:-0.842439 5:0.32
|
115
|
+
-0.125 1:-0.521058 2:-0.801829 3:-0.823171 4:-0.853454 5:0.32
|
116
|
+
-0.125 1:-0.464121 2:-0.779661 3:-0.737288 4:-0.838877 5:0.424615
|
117
|
+
-0.125 1:-0.476154 2:-0.825137 3:-0.759563 4:-0.847818 5:0.456
|
118
|
+
-0.125 1:-0.515772 2:-0.833333 3:-0.583333 4:-0.901441 5:0.32
|
119
|
+
-0.125 1:-0.527421 2:-0.730413 3:-0.814659 4:-0.894421 5:0.125714
|
120
|
+
-0.125 1:-0.524205 2:-0.818616 3:-0.809069 4:-0.880489 5:0.32
|
121
|
+
-0.125 1:-0.548023 2:-0.787234 3:-0.829787 4:-0.897445 5:0.125714
|
122
|
+
-0.125 1:-0.419355 2:-0.839286 3:-0.5 4:-0.856545 5:0.32
|
123
|
+
-0.125 1:-0.515738 2:-0.72093 3:-0.661734 4:-0.899228 5:0.0285714
|
124
|
+
-0.125 1:-0.537209 2:-0.811024 3:-0.811024 4:-0.877706 5:0.32
|
125
|
+
-0.125 1:-0.512427 2:-0.820175 3:-0.561404 4:-0.90839 5:0.125714
|
126
|
+
-0.125 1:-0.566514 2:-0.734375 3:-0.78125 4:-0.851464 5:0.32
|
127
|
+
-0.125 1:-0.581746 2:-0.670455 3:-0.818182 4:-0.893496 5:0.32
|
128
|
+
-0.125 1:-0.501896 2:-0.760825 3:-0.802062 4:-0.864176 5:0.529231
|
129
|
+
-0.125 1:-0.658537 2:-0.481481 3:-1 4:-0.821478 5:-0.845714
|
130
|
+
-0.125 1:-0.728738 2:-0.552426 3:-0.862285 4:-0.92373 5:0.32
|
131
|
+
-0.125 1:-0.554046 2:-0.572118 3:-0.789812 4:-0.859719 5:0.32
|
132
|
+
-0.125 1:-0.515097 2:-0.566098 3:-0.855011 4:-0.830994 5:0.32
|
133
|
+
-0.125 1:-0.603835 2:-0.577181 3:-0.704698 4:-0.875993 5:-0.174545
|
134
|
+
-0.125 1:-0.57637 2:-0.550278 3:-0.777665 4:-0.875919 5:0.32
|
135
|
+
-0.125 1:-0.496503 2:-0.826087 3:-0.826087 4:-0.930144 5:0.32
|
136
|
+
-0.125 1:-0.469146 2:-0.714912 3:-0.640351 4:-0.897233 5:0.222857
|
data/lib/#language.rb#
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
require_relative "names"
|
2
|
+
|
3
|
+
module PdfExtract
  # String helpers used when analysing text extracted from a PDF: a
  # normaliser for typographic characters plus a handful of ratios computed
  # over a piece of text.
  #
  # The module is opened in nested form so that loading this file does not
  # depend on PdfExtract having been defined by an earlier require.
  module Language

    # Typographic characters and their plain replacements, applied by
    # +transliterate+. Every key is a single character and no replacement
    # contains a key, so one pass gives the same result as replacing them
    # one after another.
    TRANSLITERATIONS = {
      "\ufb01" => "fi",
      "\ufb02" => "fl",
      "\ufb03" => "ffi",
      "\ufb04" => "ffl",
      "\ufb06" => "st",
      "\u2018" => "'",
      "\u2019" => "'",
      "\u2013" => "-",
      "\u2014" => "-",
      "\u201c" => "\"",
      "\u201d" => "\"",
      "\u25af" => "(",
      "\u00b4" => "",
      "\u00b1" => "-"
    }.freeze

    # Matches any single character listed in TRANSLITERATIONS.
    TRANSLITERATION_PATTERN = Regexp.union(TRANSLITERATIONS.keys)

    # Returns a copy of +s+ with the characters in TRANSLITERATIONS replaced
    # and every run of whitespace collapsed to a single space. +s+ itself is
    # not modified.
    def self.transliterate s
      s.gsub(TRANSLITERATION_PATTERN, TRANSLITERATIONS).gsub(/\s+/, " ")
    end

    # Fraction of the characters in +s+ that are an upper-case ASCII letter,
    # a digit, or one of - [ ] , . " ' ( ). Lower-case letters are not
    # counted. Returns NaN for an empty string (0 / 0.0).
    def self.letter_ratio s
      s.count("A-Z0-9\-[],.\"'()") / s.length.to_f
    end

    # TODO Ignore caps in middle of words
    #
    # Number of upper-case ASCII letters in +s+ that are not the first
    # non-whitespace character after a "." (or after the start of the
    # string), divided by the number of whitespace-separated words.
    # Returns NaN when +s+ contains no words.
    def self.cap_ratio s
      after_period = true
      caps = 0

      s.each_char do |c|
        case c
        when "."
          after_period = true
        when /[A-Z]/
          caps += 1 unless after_period
          after_period = false
        when /\S/
          # Any other visible character ends the "just after a period"
          # state; whitespace leaves it untouched.
          after_period = false
        end
      end

      caps / s.split.length.to_f
    end

    # Fraction of whitespace-separated words in +s+ that contain four digits
    # with a non-digit character directly on each side. Returns NaN when +s+
    # contains no words.
    #
    # NOTE(review): because both neighbours must be present inside the same
    # word, "(2011)" counts but a bare "2011" or "2011." does not. Confirm
    # this is intended before loosening the pattern; weights used elsewhere
    # may have been tuned against the current behaviour.
    def self.year_ratio s
      words = s.split
      words.count { |word| word =~ /[^\d]\d{4}[^\d]/ } / words.length.to_f
    end

    # Returns the :name_frequency entry of the hash produced by
    # PdfExtract::Names.detect_names for +content+.
    def self.name_ratio content
      PdfExtract::Names.detect_names(content)[:name_frequency]
    end

    # Number of whitespace-separated words in +s+.
    def self.word_count s
      s.split.length
    end

  end
end
|
66
|
+
|
data/lib/analysis/sections.rb
CHANGED
@@ -138,21 +138,15 @@ module PdfExtract
|
|
138
138
|
sections = add_content_stats sections, pages.keys.count
|
139
139
|
|
140
140
|
# Score sections into categories based on their textual attributes.
|
141
|
-
|
142
|
-
:
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
},
|
148
|
-
:body => {
|
149
|
-
:name_ratio => [0.03, 1],
|
150
|
-
:letter_ratio => [0.1, 1],
|
151
|
-
:year_ratio => [0.0, 1]
|
152
|
-
}
|
141
|
+
ref_ideals = {
|
142
|
+
:name_ratio => [0.14, 1],
|
143
|
+
:letter_ratio => [0.23, 6],
|
144
|
+
:year_ratio => [0.05, 10],
|
145
|
+
:cap_ratio => [0.49, 10],
|
146
|
+
:lateness => [0.96, 6]
|
153
147
|
}
|
154
148
|
|
155
|
-
Spatial.score(sections,
|
149
|
+
Spatial.score(sections, ref_ideals, :reference_score)
|
156
150
|
|
157
151
|
sections
|
158
152
|
end
|
data/lib/language.rb
CHANGED
@@ -5,12 +5,21 @@ module PdfExtract::Language
|
|
5
5
|
def self.transliterate s
|
6
6
|
s = s.gsub "\ufb01", "fi"
|
7
7
|
s = s.gsub "\ufb02", "fl"
|
8
|
+
s = s.gsub "\ufb03", "ffi"
|
9
|
+
s = s.gsub "\ufb04", "ffl"
|
10
|
+
s = s.gsub "\ufb06", "st"
|
8
11
|
s = s.gsub "\u2018", "'"
|
9
12
|
s = s.gsub "\u2019", "'"
|
10
13
|
s = s.gsub "\u2013", "-"
|
14
|
+
s = s.gsub "\u2014", "-"
|
11
15
|
s = s.gsub "\u201c", "\""
|
12
16
|
s = s.gsub "\u201d", "\""
|
13
|
-
s
|
17
|
+
s = s.gsub "\u25af", "("
|
18
|
+
s = s.gsub "\u00b4", ""
|
19
|
+
s = s.gsub "\u00b1", "-"
|
20
|
+
|
21
|
+
|
22
|
+
s = s.gsub /\s+/, " "
|
14
23
|
end
|
15
24
|
|
16
25
|
def self.letter_ratio s
|
@@ -40,7 +49,7 @@ module PdfExtract::Language
|
|
40
49
|
words = s.split
|
41
50
|
|
42
51
|
year_words = words.map do |word|
|
43
|
-
word =~
|
52
|
+
word =~ /[^\d]\d{4}[^\d]/
|
44
53
|
end
|
45
54
|
|
46
55
|
year_words.reject { |year_word| not year_word }.length / words.length.to_f
|
data/lib/model/characters.rb
CHANGED
@@ -49,8 +49,17 @@ module PdfExtract
|
|
49
49
|
end
|
50
50
|
end
|
51
51
|
|
52
|
+
# Returns the :MediaBox entry that applies to +page+.
#
# If +page+ has no :MediaBox of its own, its :Parent reference is looked up
# in +objects+ and the search continues from that node. The result is read
# by callers as a four-element array (indices 0..3).
#
# page    - hash-like page dictionary, read with [:MediaBox] and [:Parent].
# objects - lookup from a :Parent reference to the parent dictionary.
#
# Falls back to [0, 0, 0, 0] when no :MediaBox is found, when a :Parent
# reference cannot be resolved, or when the :Parent chain loops back on
# itself (which would otherwise never terminate on a malformed document).
def self.find_media_box page, objects
  visited = []
  node = page

  while node
    return node[:MediaBox] if node[:MediaBox]

    parent_ref = node[:Parent]
    break unless parent_ref
    # A reference seen before means the chain is cyclic; give up.
    break if visited.include?(parent_ref)

    visited << parent_ref
    # A dangling reference yields nil here, which ends the loop.
    node = objects[parent_ref]
  end

  [0, 0, 0, 0]
end
|
61
|
+
|
52
62
|
def self.make_text_runs text, tj, state, render_state, page, page_number
|
53
|
-
# TODO Ignore chars outside the page :MediaBox.
|
54
63
|
# TODO Mul UserUnit if specified by page.
|
55
64
|
# TODO Include writing mode, so that runs can be joined either
|
56
65
|
# vertically or horizontally in the join stage.
|
@@ -87,6 +96,8 @@ module PdfExtract
|
|
87
96
|
|
88
97
|
px = bl_pos.row(0)[0]
|
89
98
|
py = bl_pos.row(0)[1]
|
99
|
+
|
100
|
+
media_box = find_media_box(page.page_object, page.objects)
|
90
101
|
|
91
102
|
objs << {
|
92
103
|
:x => px,
|
@@ -97,8 +108,8 @@ module PdfExtract
|
|
97
108
|
:content => state.last[:font].to_utf8(c),
|
98
109
|
:page => page_number,
|
99
110
|
:font => state.last[:font].basefont,
|
100
|
-
:page_width =>
|
101
|
-
:page_height =>
|
111
|
+
:page_width => media_box[2] - media_box[0],
|
112
|
+
:page_height => media_box[3] - media_box[1]
|
102
113
|
}
|
103
114
|
|
104
115
|
disp_x, disp_y = glyph_displacement(c, state)
|
@@ -114,6 +125,17 @@ module PdfExtract
|
|
114
125
|
objs
|
115
126
|
end
|
116
127
|
|
128
|
+
# Builds the per-page font tables.
#
# For every (label, ref) pair yielded by page.fonts, a PDF::Reader::Font is
# created from page.objects and the object stored under ref, and a
# FontMetrics is created from that font.
#
# Returns a two-element array: [fonts by label, font metrics by label].
def self.build_fonts page
  fonts, metrics = {}, {}

  page.fonts.each do |label, ref|
    fonts[label] = PDF::Reader::Font.new(page.objects, page.objects[ref])
    metrics[label] = FontMetrics.new(fonts[label])
  end

  [fonts, metrics]
end
|
138
|
+
|
117
139
|
def self.include_in pdf
|
118
140
|
|
119
141
|
pdf.spatials :characters do |parser|
|
@@ -127,15 +149,19 @@ module PdfExtract
|
|
127
149
|
:tlm => Matrix.identity(3)
|
128
150
|
}
|
129
151
|
|
130
|
-
parser.for :resource_font do |data|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
152
|
+
# parser.for :resource_font do |data|
|
153
|
+
# puts data
|
154
|
+
# fonts[data[0]] = data[1]
|
155
|
+
# font_metrics[data[0]] = FontMetrics.new data[1]
|
156
|
+
# nil
|
157
|
+
# end
|
135
158
|
|
136
159
|
parser.for :begin_page do |data|
|
137
160
|
page = data[0]
|
138
161
|
page_n = page_n.next
|
162
|
+
|
163
|
+
fonts, font_metrics = build_fonts page
|
164
|
+
|
139
165
|
state << {
|
140
166
|
:h_scale => 100,
|
141
167
|
:char_spacing => 0,
|
@@ -270,7 +296,7 @@ module PdfExtract
|
|
270
296
|
|
271
297
|
# Show text operators.
|
272
298
|
|
273
|
-
parser.for :
|
299
|
+
parser.for :set_spacing_next_line_show_text do |data|
|
274
300
|
state.last[:word_spacing] = data[0]
|
275
301
|
state.last[:char_spacing] = data[1]
|
276
302
|
|
@@ -282,7 +308,7 @@ module PdfExtract
|
|
282
308
|
make_text_runs data[2], 0, state, render_state, page, page_n
|
283
309
|
end
|
284
310
|
|
285
|
-
parser.for :
|
311
|
+
parser.for :move_to_next_line_and_show_text do |data|
|
286
312
|
render_state[:tm] = Matrix[
|
287
313
|
[1, 0, 0], [0, 1, 0], [0, -state.last[:leading], 1]
|
288
314
|
] * render_state[:tlm]
|
@@ -291,11 +317,11 @@ module PdfExtract
|
|
291
317
|
make_text_runs data.first, 0, state, render_state, page, page_n
|
292
318
|
end
|
293
319
|
|
294
|
-
parser.for :
|
320
|
+
parser.for :show_text do |data|
|
295
321
|
make_text_runs data.first, 0, state, render_state, page, page_n
|
296
322
|
end
|
297
323
|
|
298
|
-
parser.for :
|
324
|
+
parser.for :show_text_with_positioning do |data|
|
299
325
|
data = data.first
|
300
326
|
runs = []
|
301
327
|
tj = 0
|
data/lib/pdf.rb
CHANGED
@@ -129,9 +129,9 @@ module PdfExtract
|
|
129
129
|
end
|
130
130
|
|
131
131
|
paged_objs.each_pair do |page, objs|
|
132
|
-
|
132
|
+
call_before
|
133
133
|
|
134
|
-
if
|
134
|
+
if object_calls?
|
135
135
|
@object_listeners.each_pair do |type, listeners|
|
136
136
|
listeners.each do |listener|
|
137
137
|
if objs[type].nil?
|
@@ -142,22 +142,29 @@ module PdfExtract
|
|
142
142
|
end
|
143
143
|
end
|
144
144
|
|
145
|
-
|
145
|
+
call_after
|
146
146
|
end
|
147
147
|
|
148
148
|
else
|
149
149
|
|
150
|
-
|
151
|
-
if
|
152
|
-
|
150
|
+
call_before
|
151
|
+
if object_calls?
|
152
|
+
call_object_listeners @pdf.spatial_objects
|
153
153
|
end
|
154
|
-
|
154
|
+
call_after
|
155
155
|
|
156
156
|
end
|
157
157
|
|
158
|
-
if
|
159
|
-
|
160
|
-
PDF::Reader.file filename, self, :raw_text => true
|
158
|
+
if for_calls?
|
159
|
+
expand_listeners_to_callback_methods
|
160
|
+
#PDF::Reader.file filename, self, :raw_text => true
|
161
|
+
|
162
|
+
reader = PDF::Reader.new filename, :raw_text => true
|
163
|
+
reader.pages.each do |page|
|
164
|
+
begin_page page
|
165
|
+
page.walk self
|
166
|
+
end_page page
|
167
|
+
end
|
161
168
|
end
|
162
169
|
end
|
163
170
|
|
@@ -1,11 +1,14 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
require_relative "../spatial"
|
3
|
+
require_relative "score"
|
2
4
|
|
3
5
|
module PdfExtract
|
4
6
|
module References
|
5
|
-
|
6
|
-
Settings.default :
|
7
|
+
|
8
|
+
Settings.default :reference_flex, 0.1
|
7
9
|
Settings.default :min_sequence_count, 3
|
8
10
|
Settings.default :max_reference_order, 1000
|
11
|
+
Settings.default :min_lateness , 0.5
|
9
12
|
|
10
13
|
def self.partition_by ary, &block
|
11
14
|
matching = []
|
@@ -159,23 +162,44 @@ module PdfExtract
|
|
159
162
|
def self.include_in pdf
|
160
163
|
pdf.spatials :references, :depends_on => [:sections] do |parser|
|
161
164
|
|
162
|
-
|
165
|
+
sections = []
|
163
166
|
|
164
167
|
parser.objects :sections do |section|
|
165
|
-
|
166
|
-
if section[:reference_score] >= pdf.settings[:min_score]
|
167
|
-
if numeric_sequence? pdf, Spatial.get_text_content(section)
|
168
|
-
refs += split_by_delimiter pdf, Spatial.get_text_content(section)
|
169
|
-
elsif multi_margin? section[:lines]
|
170
|
-
refs += split_by_margin section[:lines]
|
171
|
-
elsif multi_spacing? section[:lines]
|
172
|
-
refs += split_by_line_spacing section[:lines]
|
173
|
-
end
|
174
|
-
end
|
168
|
+
sections << section
|
175
169
|
end
|
176
170
|
|
177
171
|
parser.after do
|
178
|
-
|
172
|
+
max_score = sections.map {|s| s[:reference_score]}.max
|
173
|
+
min_permittable = max_score - (max_score * pdf.settings[:reference_flex])
|
174
|
+
|
175
|
+
refs = []
|
176
|
+
|
177
|
+
sections = sections.reject do |s|
|
178
|
+
# A section without any years is definitely not a list of
|
179
|
+
# references. So too a section that appears in the first
|
180
|
+
# half of an article.
|
181
|
+
s[:lateness] < pdf.settings[:min_lateness] || s[:year_ratio].zero?
|
182
|
+
end
|
183
|
+
|
184
|
+
sections.each do |section|
|
185
|
+
if section[:reference_score] >= min_permittable
|
186
|
+
if numeric_sequence? pdf, Spatial.get_text_content(section)
|
187
|
+
refs += split_by_delimiter pdf, Spatial.get_text_content(section)
|
188
|
+
elsif multi_margin? section[:lines]
|
189
|
+
refs += split_by_margin section[:lines]
|
190
|
+
elsif multi_spacing? section[:lines]
|
191
|
+
refs += split_by_line_spacing section[:lines]
|
192
|
+
end
|
193
|
+
end
|
194
|
+
end
|
195
|
+
|
196
|
+
# TODO Ideally we wouldn't see the ref headers here.
|
197
|
+
# Unfortunately publication details can look a lot like references.
|
198
|
+
refs.reject do |ref|
|
199
|
+
norm = ref[:content].downcase.strip
|
200
|
+
norm =~ /references?/ || norm =~ /submitted for publication/ || norm =~ /additional contributions/
|
201
|
+
end
|
202
|
+
|
179
203
|
end
|
180
204
|
|
181
205
|
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require "svm"
|
2
|
+
|
3
|
+
module PdfExtract
  # Classifies sections with the model stored in data/reference.model,
  # loaded through the Model class.
  module Score

    # Absolute-ish path to +data_filename+ inside the data directory that
    # sits two levels above this file's directory.
    def self.path_to_data data_filename
      File.join(File.dirname(File.expand_path(__FILE__)), "../../data/" + data_filename)
    end

    # Loaded once, when this file is first required.
    @@reference_model = Model.new(path_to_data("reference.model"))

    # Returns true when the model predicts label 1 for +section+.
    #
    # section - hash providing :letter_ratio, :name_ratio, :year_ratio,
    #           :cap_ratio and :lateness; they are passed to the model as
    #           features 1 to 5 in that order.
    #
    # NOTE(review): the support vectors in data/reference.model appear to
    # be scaled to the range -1..1, while the values here are passed as
    # stored on the section. Confirm whether scaling is expected before
    # relying on the prediction.
    def self.reference? section
      sample = {
        1 => section[:letter_ratio],
        2 => section[:name_ratio],
        3 => section[:year_ratio],
        4 => section[:cap_ratio],
        5 => section[:lateness]
      }

      # Predict once and do not write to stdout: this is library code, and
      # the previous debug output ended up in the caller's output stream.
      @@reference_model.predict(sample) == 1
    end

  end
end
|
data/lib/spatial.rb
CHANGED
@@ -153,42 +153,23 @@ module PdfExtract
|
|
153
153
|
(b_top <= a_top && b_top >= a[from]) || (b[from] >= a[from] && b[from] <= b_top)
|
154
154
|
end
|
155
155
|
|
156
|
-
def self.score items, ideals
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
types.each do |name, vars|
|
163
|
-
score_name = (name.to_s + "_score").to_sym
|
156
|
+
def self.score items, ideals, name
|
157
|
+
ideals.keys.each do |f|
|
158
|
+
diffs = items.map {|item| (item[f] - ideals[f][0]).abs}
|
159
|
+
diffs.map! {|d| d.nan? ? 1 : d}
|
160
|
+
max_diff = diffs.max
|
164
161
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
if diff.zero?
|
171
|
-
scores << 1.0
|
172
|
-
else
|
173
|
-
s = 1.0 / diff
|
174
|
-
if not s.finite?
|
175
|
-
scores << 0.0
|
176
|
-
else
|
177
|
-
scores << s
|
178
|
-
end
|
179
|
-
end
|
180
|
-
end
|
181
|
-
|
182
|
-
score_max = scores.max
|
183
|
-
weighted_scores = scores.map do |score|
|
184
|
-
(score / score_max) * ideals[name][var_name][1]
|
162
|
+
scores = diffs.map do |d|
|
163
|
+
if d == 0
|
164
|
+
ideals[f][1]
|
165
|
+
else
|
166
|
+
(1 - (d / max_diff)) * ideals[f][1]
|
185
167
|
end
|
168
|
+
end
|
186
169
|
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
end
|
191
|
-
|
170
|
+
items.each_index do |i|
|
171
|
+
items[i][name] ||= 0
|
172
|
+
items[i][name] = items[i][name] + scores[i]
|
192
173
|
end
|
193
174
|
end
|
194
175
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: 0.0.
|
8
|
+
- 7
|
9
|
+
version: 0.0.7
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Karl Jonathan Ward
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-11-
|
17
|
+
date: 2011-11-09 00:00:00 +00:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -23,14 +23,13 @@ dependencies:
|
|
23
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
24
24
|
none: false
|
25
25
|
requirements:
|
26
|
-
- - "
|
26
|
+
- - ">="
|
27
27
|
- !ruby/object:Gem::Version
|
28
28
|
segments:
|
29
|
-
- 1
|
30
29
|
- 0
|
31
|
-
-
|
32
|
-
-
|
33
|
-
version:
|
30
|
+
- 10
|
31
|
+
- 1
|
32
|
+
version: 0.10.1
|
34
33
|
type: :runtime
|
35
34
|
version_requirements: *id001
|
36
35
|
- !ruby/object:Gem::Dependency
|
@@ -146,6 +145,7 @@ files:
|
|
146
145
|
- bin/some6.mask.pdf
|
147
146
|
- bin/train.rb
|
148
147
|
- bin/two-column.mask.pdf
|
148
|
+
- lib/#language.rb#
|
149
149
|
- lib/analysis/columns.rb
|
150
150
|
- lib/analysis/margins.rb
|
151
151
|
- lib/analysis/sections.rb
|
@@ -164,12 +164,14 @@ files:
|
|
164
164
|
- lib/references/references.rb
|
165
165
|
- lib/references/resolve.rb
|
166
166
|
- lib/references/resolved_references.rb
|
167
|
+
- lib/references/score.rb
|
167
168
|
- lib/spatial.rb
|
168
169
|
- lib/view/abstract_view.rb
|
169
170
|
- lib/view/pdf_view.rb
|
170
171
|
- lib/view/png_view.rb
|
171
172
|
- lib/view/xml_view.rb
|
172
173
|
- data/familynames.db
|
174
|
+
- data/reference.model
|
173
175
|
- data/stopwords.txt
|
174
176
|
has_rdoc: true
|
175
177
|
homepage: http://github.com/CrossRef/pdfextract
|