anystyle 1.1.0 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/HISTORY.md +9 -0
- data/lib/anystyle.rb +2 -0
- data/lib/anystyle/document.rb +61 -127
- data/lib/anystyle/feature/line.rb +23 -8
- data/lib/anystyle/feature/ref.rb +4 -4
- data/lib/anystyle/finder.rb +4 -4
- data/lib/anystyle/normalizer/locale.rb +17 -6
- data/lib/anystyle/normalizer/names.rb +1 -1
- data/lib/anystyle/page.rb +50 -0
- data/lib/anystyle/refs.rb +244 -0
- data/lib/anystyle/support/finder.mod +5972 -3461
- data/lib/anystyle/support/finder.txt +94 -72
- data/lib/anystyle/support/parser.mod +12876 -12387
- data/lib/anystyle/utils.rb +49 -5
- data/lib/anystyle/version.rb +1 -1
- data/res/finder/bb132pr2055.ttx +20 -20
- data/res/finder/bb408gp7470.ttx +3919 -0
- data/res/finder/bb599nz4341.ttx +5 -5
- data/res/finder/bb725rt6501.ttx +5 -5
- data/res/finder/bc605xz1554.ttx +40 -40
- data/res/finder/bd040gx5718.ttx +15 -15
- data/res/finder/bd413nt2715.ttx +46 -46
- data/res/finder/bf668vw2021.ttx +7 -7
- data/res/finder/bg495cx0468.ttx +19 -19
- data/res/finder/bg599vt3743.ttx +6 -6
- data/res/finder/bg608dx2253.ttx +3 -3
- data/res/finder/bh410qk3771.ttx +23 -23
- data/res/finder/bh989ww6442.ttx +33 -33
- data/res/finder/bj581pc8202.ttx +2 -2
- data/res/parser/core.xml +47 -0
- data/res/parser/gold.xml +59 -8
- metadata +6 -4
- data/res/finder/bb550sh8053.ttx +0 -18660
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 3b74980250ee2a9627b97a9f91e6c11052ddd1e0
|
4
|
+
data.tar.gz: 645f6371761078aa5d4b49c0905a749eda55dd3b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 449c2ffca67851254e4e9a98fa5618a57928147e3fda613b908ab83bc8d4d096cec7f11bae9f7727fbcad69dfac73d6f4f1034f00cece3c30a7813bfc7c2c20e
|
7
|
+
data.tar.gz: c14d4637d11689eb88727baabcdc30c672642d93dad810c15b86e2197835c75360c21ccf03bbdf9da3e909e0f0777d55f9ef81218700231db8ea5504051f8699
|
data/HISTORY.md
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
1.2.0 / 2018-08-16
|
2
|
+
==================
|
3
|
+
* Added check and train commands to CLI.
|
4
|
+
* Added --no-solo and --crop flags to find command.
|
5
|
+
* Added reference block normalizer.
|
6
|
+
* Added script detection normalizer.
|
7
|
+
* Improved Finder reference line joining.
|
8
|
+
* Improved Finder model; training sets.
|
9
|
+
|
1
10
|
1.1.0 / 2018-07-11
|
2
11
|
==================
|
3
12
|
* Improved Parser model; training sets.
|
data/lib/anystyle.rb
CHANGED
data/lib/anystyle/document.rb
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
module AnyStyle
|
2
2
|
class Document < Wapiti::Sequence
|
3
|
+
|
4
|
+
REFSECT = /references|referenzen|cited|bibliogra|secondary sources|literatur/i
|
5
|
+
|
3
6
|
class << self
|
4
|
-
include
|
7
|
+
include PDFUtils
|
5
8
|
|
6
9
|
def parse(string, delimiter: /\r?\n/, tagged: false)
|
7
10
|
current_label = ''
|
@@ -14,7 +17,7 @@ module AnyStyle
|
|
14
17
|
})
|
15
18
|
end
|
16
19
|
|
17
|
-
def open(path, format: File.extname(path), tagged: false,
|
20
|
+
def open(path, format: File.extname(path), tagged: false, **opts)
|
18
21
|
raise ArgumentError,
|
19
22
|
"cannot open tainted path: '#{path}'" if path.tainted?
|
20
23
|
raise ArgumentError,
|
@@ -26,7 +29,7 @@ module AnyStyle
|
|
26
29
|
when '.pdf'
|
27
30
|
meta = pdf_meta path if opts[:parse_meta]
|
28
31
|
info = pdf_info path if opts[:parse_info]
|
29
|
-
input = pdf_to_text path,
|
32
|
+
input = pdf_to_text path, **opts
|
30
33
|
when '.ttx'
|
31
34
|
tagged = true
|
32
35
|
input = File.read(path, encoding: 'utf-8')
|
@@ -47,8 +50,16 @@ module AnyStyle
|
|
47
50
|
attr_accessor :meta, :info, :path, :pages, :tokens
|
48
51
|
alias_method :lines, :tokens
|
49
52
|
|
53
|
+
def line_counts
|
54
|
+
@line_counts ||= Hash.new(0)
|
55
|
+
end
|
56
|
+
|
57
|
+
def nnum_counts
|
58
|
+
@nnum_counts ||= Hash.new(0)
|
59
|
+
end
|
60
|
+
|
50
61
|
def pages
|
51
|
-
@pages ||= Page.parse(lines)
|
62
|
+
@pages ||= Page.parse(lines, self)
|
52
63
|
end
|
53
64
|
|
54
65
|
def each
|
@@ -64,28 +75,34 @@ module AnyStyle
|
|
64
75
|
end
|
65
76
|
end
|
66
77
|
|
67
|
-
def each_section
|
78
|
+
def each_section(skip: ['meta'])
|
68
79
|
if block_given?
|
69
|
-
|
80
|
+
head = []
|
81
|
+
body = []
|
82
|
+
seen_content = false
|
83
|
+
|
70
84
|
lines.each do |ln|
|
71
85
|
case ln.label
|
72
86
|
when 'title'
|
73
|
-
|
74
|
-
yield
|
75
|
-
|
87
|
+
if seen_content
|
88
|
+
yield [head, body]
|
89
|
+
head, body, seen_content = [ln], [], false
|
90
|
+
else
|
91
|
+
head << ln
|
76
92
|
end
|
77
93
|
when 'ref', 'text'
|
78
|
-
|
94
|
+
body << ln
|
95
|
+
seen_content = true
|
79
96
|
else
|
80
|
-
|
97
|
+
body << ln unless skip.include?(ln.label)
|
81
98
|
end
|
82
99
|
end
|
83
|
-
unless
|
84
|
-
yield
|
100
|
+
unless head.empty?
|
101
|
+
yield [head, body]
|
85
102
|
end
|
86
103
|
self
|
87
104
|
else
|
88
|
-
to_enum
|
105
|
+
to_enum :each_section
|
89
106
|
end
|
90
107
|
end
|
91
108
|
|
@@ -94,7 +111,8 @@ module AnyStyle
|
|
94
111
|
doc.tokens = lines.map.with_index { |line, idx|
|
95
112
|
Wapiti::Token.new line.value,
|
96
113
|
label: other[idx].label.to_s,
|
97
|
-
observations: other[idx].observations.dup
|
114
|
+
observations: other[idx].observations.dup,
|
115
|
+
score: other[idx].score
|
98
116
|
}
|
99
117
|
doc
|
100
118
|
end
|
@@ -126,83 +144,45 @@ module AnyStyle
|
|
126
144
|
}
|
127
145
|
end
|
128
146
|
|
129
|
-
def references(**opts)
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
else
|
142
|
-
if join_refs?(current, val, delta, idt - indent)
|
143
|
-
current = join_refs(current, val)
|
144
|
-
else
|
145
|
-
bib << current
|
146
|
-
current, delta, indent = val, 0, idt
|
147
|
-
end
|
148
|
-
end
|
149
|
-
else
|
150
|
-
unless current.nil?
|
151
|
-
if delta > 15 || %w{ blank meta }.include?(ln.label)
|
152
|
-
delta += 1
|
153
|
-
else
|
154
|
-
bib << current
|
155
|
-
current, delta, indent = nil, 0, idt
|
147
|
+
def references(normalize_blocks: false, **opts)
|
148
|
+
if normalize_blocks
|
149
|
+
each_section.inject([]) do |refs, (head, body)|
|
150
|
+
rc = body.count { |tk| tk.label == 'ref' }
|
151
|
+
unless rc == 0
|
152
|
+
tc = body.count { |tk| tk.label == 'text' }
|
153
|
+
is_ref_sect = !head.find { |tk| tk.value =~ REFSECT }.nil?
|
154
|
+
|
155
|
+
# Skip sections with few ref lines!
|
156
|
+
if is_ref_sect || include_references?(rc, tc)
|
157
|
+
Refs.normalize! body, max_win_size: is_ref_sect ? 6 : 2
|
158
|
+
refs.concat Refs.parse(body).to_a
|
156
159
|
end
|
157
160
|
end
|
158
|
-
end
|
159
|
-
end
|
160
161
|
|
161
|
-
|
162
|
-
|
162
|
+
refs
|
163
|
+
end
|
164
|
+
else
|
165
|
+
Refs.parse(lines).to_a
|
163
166
|
end
|
164
|
-
|
165
|
-
bib
|
166
167
|
end
|
167
168
|
|
168
|
-
def
|
169
|
-
|
170
|
-
indent > 0,
|
171
|
-
delta == 0,
|
172
|
-
b.length < 50,
|
173
|
-
a.length < 65,
|
174
|
-
!!a.match(/[,;:&\p{Pd}]$/),
|
175
|
-
!!b.match(/^\p{Ll}/) || !!a.match(/\p{L}$/) && !!b.match(/^\p{L}/)
|
176
|
-
].count(true)
|
177
|
-
|
178
|
-
con = [
|
179
|
-
indent < 0,
|
180
|
-
delta > 8,
|
181
|
-
!!a.match(/\.\]$/),
|
182
|
-
a.length > 500,
|
183
|
-
(b.length - a.length) > 12,
|
184
|
-
!!b.match(/^(\p{Pd}\p{Pd}|\p{Lu}\p{Ll}+, \p{Lu}\.|\[\d)/)
|
185
|
-
].count(true)
|
186
|
-
|
187
|
-
(pro - con) > 1
|
169
|
+
def include_references?(rc, tc)
|
170
|
+
rc > 10 || (rc + tc) > 20 && (rc.to_f / tc) > 0.2
|
188
171
|
end
|
189
172
|
|
190
|
-
def
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
173
|
+
def sections(delimiter: "\n", spacer: ' ', **opts)
|
174
|
+
each_section.map do |(head, body)|
|
175
|
+
{
|
176
|
+
title: head.map { |tk|
|
177
|
+
display_chars(tk.value).lstrip.unicode_normalize
|
178
|
+
}.join(spacer),
|
179
|
+
text: body.map { |tk|
|
180
|
+
display_chars(tk.value).unicode_normalize
|
181
|
+
}.join(delimiter)
|
182
|
+
}
|
199
183
|
end
|
200
184
|
end
|
201
185
|
|
202
|
-
def sections(delimiter: "\n", **opts)
|
203
|
-
[]
|
204
|
-
end
|
205
|
-
|
206
186
|
def title(delimiter: " ", **opts)
|
207
187
|
lines.drop_while { |ln|
|
208
188
|
ln.label != 'title'
|
@@ -214,51 +194,5 @@ module AnyStyle
|
|
214
194
|
def inspect
|
215
195
|
"#<AnyStyle::Document lines={#{size}}>"
|
216
196
|
end
|
217
|
-
|
218
|
-
|
219
|
-
class Page
|
220
|
-
extend StringUtils
|
221
|
-
|
222
|
-
class << self
|
223
|
-
def parse(lines)
|
224
|
-
pages, current, width = [], [], 0
|
225
|
-
|
226
|
-
lines.each do |line|
|
227
|
-
if page_break?(line.value)
|
228
|
-
unless current.empty?
|
229
|
-
pages << new(current, width: width)
|
230
|
-
end
|
231
|
-
|
232
|
-
current = [line]
|
233
|
-
width = display_width(line.value)
|
234
|
-
else
|
235
|
-
current << line
|
236
|
-
width = [width, display_width(line.value)].max
|
237
|
-
end
|
238
|
-
end
|
239
|
-
|
240
|
-
unless current.empty?
|
241
|
-
pages << new(current, width: width)
|
242
|
-
end
|
243
|
-
|
244
|
-
pages
|
245
|
-
end
|
246
|
-
end
|
247
|
-
|
248
|
-
attr_accessor :lines, :width
|
249
|
-
|
250
|
-
def initialize(lines = [], width: 0)
|
251
|
-
@lines = lines
|
252
|
-
@width = width
|
253
|
-
end
|
254
|
-
|
255
|
-
def size
|
256
|
-
lines.size
|
257
|
-
end
|
258
|
-
|
259
|
-
def inspect
|
260
|
-
"#<AnyStyle::Document::Page size={#{size}} width={#{width}}>"
|
261
|
-
end
|
262
|
-
end
|
263
197
|
end
|
264
198
|
end
|
@@ -1,8 +1,8 @@
|
|
1
1
|
module AnyStyle
|
2
2
|
class Feature
|
3
3
|
class Line < Feature
|
4
|
-
def observe(token, page:, **opts)
|
5
|
-
chars = display_chars(token)
|
4
|
+
def observe(token, page:, seq:, **opts)
|
5
|
+
chars = display_chars(token)
|
6
6
|
|
7
7
|
lttrs = count(chars, /\p{L}/)
|
8
8
|
upper = count(chars, /\p{Lu}/)
|
@@ -18,22 +18,37 @@ module AnyStyle
|
|
18
18
|
ratio(white, chars.length),
|
19
19
|
ratio(punct, chars.length),
|
20
20
|
ratio(width, page.width),
|
21
|
-
classify(chars)
|
21
|
+
classify(chars),
|
22
|
+
page_ratio(seq.line_counts[chars], seq.pages.length),
|
23
|
+
page_ratio(seq.nnum_counts[nnum(chars)], seq.pages.length)
|
22
24
|
]
|
23
25
|
end
|
24
26
|
|
25
27
|
def classify(chars)
|
26
|
-
case chars
|
27
|
-
when /\.\s*\.\s*\.\s
|
28
|
+
case chars.lstrip
|
29
|
+
when /\.\s*\.\s*\.\s*\.|……+/, /\p{L}\s{5,}\d+$/
|
28
30
|
:toc
|
29
|
-
when
|
30
|
-
:
|
31
|
-
when
|
31
|
+
when /^[\[\(]?\d+\.?[\]\)]?\s+\p{L}+/
|
32
|
+
:list
|
33
|
+
when /^(\p{Lu}\.?)\s*(\d+\.)+\s+\p{L}+/
|
34
|
+
:title
|
35
|
+
when /^(\w+\s)?(tab(le|elle|\.)|fig(ure|\.))/i
|
32
36
|
:cap
|
37
|
+
when /^\p{Pd}?\d+\p{Pd}?$/, /^[ivx]+$/i
|
38
|
+
:num
|
39
|
+
when /copyright|©|rights reserved/i
|
40
|
+
:copyright
|
41
|
+
when /https?:\/\//i
|
42
|
+
:http
|
33
43
|
else
|
34
44
|
:none
|
35
45
|
end
|
36
46
|
end
|
47
|
+
|
48
|
+
def page_ratio(a, b)
|
49
|
+
r = a.to_f / b
|
50
|
+
r == 1 ? '=' : r > 1 ? '+' : (r * 10).round
|
51
|
+
end
|
37
52
|
end
|
38
53
|
end
|
39
54
|
end
|
data/lib/anystyle/feature/ref.rb
CHANGED
@@ -3,8 +3,8 @@ module AnyStyle
|
|
3
3
|
class Ref < Feature
|
4
4
|
def observe(token, **opts)
|
5
5
|
[
|
6
|
-
symbolize(count(token, /\b(1
|
7
|
-
symbolize(count(token, /(\d[\(:;]\d)|(\d\s*\p{Pd}+\s*\d)|\bpp?\.|\bvols?\.|\
|
6
|
+
symbolize(count(token, /\b(1[4-9]|20)\d\d\b/)),
|
7
|
+
symbolize(count(token, /(\d[\(:;]\d)|(\d\s*\p{Pd}+\s*\d)|\bpp?\.|\bvols?\.|\b(nos?|nr|iss?|fasc)\.|n°|nº/i)),
|
8
8
|
symbolize(count(token, /\b\p{Lu}\./)),
|
9
9
|
symbolize(count(token, /\b(eds?\.|edited by|editors?|hg|hrsg|et al)\b/i)),
|
10
10
|
token =~ /^\s*(\[\w+\]|\(\d+\)|\d+\.)\s+/ ? 'T' : 'F'
|
@@ -13,8 +13,8 @@ module AnyStyle
|
|
13
13
|
|
14
14
|
def symbolize(k)
|
15
15
|
return '-' if k < 1
|
16
|
-
return '+' if k <
|
17
|
-
return '
|
16
|
+
return '+' if k < 2
|
17
|
+
return '*'
|
18
18
|
end
|
19
19
|
end
|
20
20
|
end
|
data/lib/anystyle/finder.rb
CHANGED
@@ -63,18 +63,18 @@ module AnyStyle
|
|
63
63
|
dataset.map { |doc| doc.references(**opts) }
|
64
64
|
end
|
65
65
|
|
66
|
-
def label(input, layout: true, **opts)
|
67
|
-
dataset = prepare(input, layout: layout, **opts)
|
66
|
+
def label(input, layout: true, crop: false, **opts)
|
67
|
+
dataset = prepare(input, layout: layout, crop: crop, **opts)
|
68
68
|
output = model.label(dataset, **opts)
|
69
69
|
Wapiti::Dataset.new(dataset.map.with_index { |doc, idx|
|
70
70
|
doc.label(output[idx])
|
71
71
|
})
|
72
72
|
end
|
73
73
|
|
74
|
-
def prepare(input, layout: true, **opts)
|
74
|
+
def prepare(input, layout: true, crop: false, **opts)
|
75
75
|
case input
|
76
76
|
when String
|
77
|
-
super(Document.open(input, layout: layout, **opts), **opts)
|
77
|
+
super(Document.open(input, layout: layout, crop: false, **opts), **opts)
|
78
78
|
when Array
|
79
79
|
super(Wapiti::Dataset.new(input.map { |f| Document.open(f, **opts) }), **opts)
|
80
80
|
else
|
@@ -1,5 +1,6 @@
|
|
1
1
|
module AnyStyle
|
2
2
|
maybe_require 'language_detector'
|
3
|
+
maybe_require 'unicode/scripts'
|
3
4
|
|
4
5
|
class Normalizer
|
5
6
|
class Locale < Normalizer
|
@@ -8,23 +9,33 @@ module AnyStyle
|
|
8
9
|
end
|
9
10
|
|
10
11
|
def normalize(item, **opts)
|
11
|
-
return item if @ld.nil? || item.key?(:language)
|
12
|
-
|
13
12
|
sample = item.values_at(
|
14
13
|
:title,
|
15
14
|
:'container-title',
|
16
|
-
|
15
|
+
:'collection-title',
|
17
16
|
:location,
|
18
17
|
:journal,
|
19
|
-
:publisher
|
20
|
-
|
18
|
+
:publisher,
|
19
|
+
:note
|
21
20
|
).flatten.compact.join(' ')
|
22
21
|
|
23
22
|
return item if sample.empty?
|
24
23
|
|
25
|
-
|
24
|
+
language = detect_language(sample)
|
25
|
+
scripts = detect_scripts(sample)
|
26
|
+
|
27
|
+
item[:language] ||= language unless language.nil?
|
28
|
+
item[:scripts] ||= scripts unless scripts.nil?
|
26
29
|
item
|
27
30
|
end
|
28
31
|
end
|
32
|
+
|
33
|
+
def detect_language(string)
|
34
|
+
@ld.detect(string) unless @ld.nil?
|
35
|
+
end
|
36
|
+
|
37
|
+
def detect_scripts(string)
|
38
|
+
::Unicode::Scripts.scripts(string) if defined?(::Unicode::Scripts)
|
39
|
+
end
|
29
40
|
end
|
30
41
|
end
|