anystyle 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/HISTORY.md +9 -0
- data/lib/anystyle.rb +2 -0
- data/lib/anystyle/document.rb +61 -127
- data/lib/anystyle/feature/line.rb +23 -8
- data/lib/anystyle/feature/ref.rb +4 -4
- data/lib/anystyle/finder.rb +4 -4
- data/lib/anystyle/normalizer/locale.rb +17 -6
- data/lib/anystyle/normalizer/names.rb +1 -1
- data/lib/anystyle/page.rb +50 -0
- data/lib/anystyle/refs.rb +244 -0
- data/lib/anystyle/support/finder.mod +5972 -3461
- data/lib/anystyle/support/finder.txt +94 -72
- data/lib/anystyle/support/parser.mod +12876 -12387
- data/lib/anystyle/utils.rb +49 -5
- data/lib/anystyle/version.rb +1 -1
- data/res/finder/bb132pr2055.ttx +20 -20
- data/res/finder/bb408gp7470.ttx +3919 -0
- data/res/finder/bb599nz4341.ttx +5 -5
- data/res/finder/bb725rt6501.ttx +5 -5
- data/res/finder/bc605xz1554.ttx +40 -40
- data/res/finder/bd040gx5718.ttx +15 -15
- data/res/finder/bd413nt2715.ttx +46 -46
- data/res/finder/bf668vw2021.ttx +7 -7
- data/res/finder/bg495cx0468.ttx +19 -19
- data/res/finder/bg599vt3743.ttx +6 -6
- data/res/finder/bg608dx2253.ttx +3 -3
- data/res/finder/bh410qk3771.ttx +23 -23
- data/res/finder/bh989ww6442.ttx +33 -33
- data/res/finder/bj581pc8202.ttx +2 -2
- data/res/parser/core.xml +47 -0
- data/res/parser/gold.xml +59 -8
- metadata +6 -4
- data/res/finder/bb550sh8053.ttx +0 -18660
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 3b74980250ee2a9627b97a9f91e6c11052ddd1e0
|
4
|
+
data.tar.gz: 645f6371761078aa5d4b49c0905a749eda55dd3b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 449c2ffca67851254e4e9a98fa5618a57928147e3fda613b908ab83bc8d4d096cec7f11bae9f7727fbcad69dfac73d6f4f1034f00cece3c30a7813bfc7c2c20e
|
7
|
+
data.tar.gz: c14d4637d11689eb88727baabcdc30c672642d93dad810c15b86e2197835c75360c21ccf03bbdf9da3e909e0f0777d55f9ef81218700231db8ea5504051f8699
|
data/HISTORY.md
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
1.2.0 / 2018-08-16
|
2
|
+
==================
|
3
|
+
* Added check and train commands to CLI.
|
4
|
+
* Added --no-solo and --crop flags to find command.
|
5
|
+
* Added reference block normalizer.
|
6
|
+
* Added script detection normalizer.
|
7
|
+
* Improved Finder reference line joining.
|
8
|
+
* Improved Finder model; training sets.
|
9
|
+
|
1
10
|
1.1.0 / 2018-07-11
|
2
11
|
==================
|
3
12
|
* Improved Parser model; training sets.
|
data/lib/anystyle.rb
CHANGED
data/lib/anystyle/document.rb
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
module AnyStyle
|
2
2
|
class Document < Wapiti::Sequence
|
3
|
+
|
4
|
+
REFSECT = /references|referenzen|cited|bibliogra|secondary sources|literatur/i
|
5
|
+
|
3
6
|
class << self
|
4
|
-
include
|
7
|
+
include PDFUtils
|
5
8
|
|
6
9
|
def parse(string, delimiter: /\r?\n/, tagged: false)
|
7
10
|
current_label = ''
|
@@ -14,7 +17,7 @@ module AnyStyle
|
|
14
17
|
})
|
15
18
|
end
|
16
19
|
|
17
|
-
def open(path, format: File.extname(path), tagged: false,
|
20
|
+
def open(path, format: File.extname(path), tagged: false, **opts)
|
18
21
|
raise ArgumentError,
|
19
22
|
"cannot open tainted path: '#{path}'" if path.tainted?
|
20
23
|
raise ArgumentError,
|
@@ -26,7 +29,7 @@ module AnyStyle
|
|
26
29
|
when '.pdf'
|
27
30
|
meta = pdf_meta path if opts[:parse_meta]
|
28
31
|
info = pdf_info path if opts[:parse_info]
|
29
|
-
input = pdf_to_text path,
|
32
|
+
input = pdf_to_text path, **opts
|
30
33
|
when '.ttx'
|
31
34
|
tagged = true
|
32
35
|
input = File.read(path, encoding: 'utf-8')
|
@@ -47,8 +50,16 @@ module AnyStyle
|
|
47
50
|
attr_accessor :meta, :info, :path, :pages, :tokens
|
48
51
|
alias_method :lines, :tokens
|
49
52
|
|
53
|
+
def line_counts
|
54
|
+
@line_counts ||= Hash.new(0)
|
55
|
+
end
|
56
|
+
|
57
|
+
def nnum_counts
|
58
|
+
@nnum_counts ||= Hash.new(0)
|
59
|
+
end
|
60
|
+
|
50
61
|
def pages
|
51
|
-
@pages ||= Page.parse(lines)
|
62
|
+
@pages ||= Page.parse(lines, self)
|
52
63
|
end
|
53
64
|
|
54
65
|
def each
|
@@ -64,28 +75,34 @@ module AnyStyle
|
|
64
75
|
end
|
65
76
|
end
|
66
77
|
|
67
|
-
def each_section
|
78
|
+
def each_section(skip: ['meta'])
|
68
79
|
if block_given?
|
69
|
-
|
80
|
+
head = []
|
81
|
+
body = []
|
82
|
+
seen_content = false
|
83
|
+
|
70
84
|
lines.each do |ln|
|
71
85
|
case ln.label
|
72
86
|
when 'title'
|
73
|
-
|
74
|
-
yield
|
75
|
-
|
87
|
+
if seen_content
|
88
|
+
yield [head, body]
|
89
|
+
head, body, seen_content = [ln], [], false
|
90
|
+
else
|
91
|
+
head << ln
|
76
92
|
end
|
77
93
|
when 'ref', 'text'
|
78
|
-
|
94
|
+
body << ln
|
95
|
+
seen_content = true
|
79
96
|
else
|
80
|
-
|
97
|
+
body << ln unless skip.include?(ln.label)
|
81
98
|
end
|
82
99
|
end
|
83
|
-
unless
|
84
|
-
yield
|
100
|
+
unless head.empty?
|
101
|
+
yield [head, body]
|
85
102
|
end
|
86
103
|
self
|
87
104
|
else
|
88
|
-
to_enum
|
105
|
+
to_enum :each_section
|
89
106
|
end
|
90
107
|
end
|
91
108
|
|
@@ -94,7 +111,8 @@ module AnyStyle
|
|
94
111
|
doc.tokens = lines.map.with_index { |line, idx|
|
95
112
|
Wapiti::Token.new line.value,
|
96
113
|
label: other[idx].label.to_s,
|
97
|
-
observations: other[idx].observations.dup
|
114
|
+
observations: other[idx].observations.dup,
|
115
|
+
score: other[idx].score
|
98
116
|
}
|
99
117
|
doc
|
100
118
|
end
|
@@ -126,83 +144,45 @@ module AnyStyle
|
|
126
144
|
}
|
127
145
|
end
|
128
146
|
|
129
|
-
def references(**opts)
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
else
|
142
|
-
if join_refs?(current, val, delta, idt - indent)
|
143
|
-
current = join_refs(current, val)
|
144
|
-
else
|
145
|
-
bib << current
|
146
|
-
current, delta, indent = val, 0, idt
|
147
|
-
end
|
148
|
-
end
|
149
|
-
else
|
150
|
-
unless current.nil?
|
151
|
-
if delta > 15 || %w{ blank meta }.include?(ln.label)
|
152
|
-
delta += 1
|
153
|
-
else
|
154
|
-
bib << current
|
155
|
-
current, delta, indent = nil, 0, idt
|
147
|
+
def references(normalize_blocks: false, **opts)
|
148
|
+
if normalize_blocks
|
149
|
+
each_section.inject([]) do |refs, (head, body)|
|
150
|
+
rc = body.count { |tk| tk.label == 'ref' }
|
151
|
+
unless rc == 0
|
152
|
+
tc = body.count { |tk| tk.label == 'text' }
|
153
|
+
is_ref_sect = !head.find { |tk| tk.value =~ REFSECT }.nil?
|
154
|
+
|
155
|
+
# Skip sections with few ref lines!
|
156
|
+
if is_ref_sect || include_references?(rc, tc)
|
157
|
+
Refs.normalize! body, max_win_size: is_ref_sect ? 6 : 2
|
158
|
+
refs.concat Refs.parse(body).to_a
|
156
159
|
end
|
157
160
|
end
|
158
|
-
end
|
159
|
-
end
|
160
161
|
|
161
|
-
|
162
|
-
|
162
|
+
refs
|
163
|
+
end
|
164
|
+
else
|
165
|
+
Refs.parse(lines).to_a
|
163
166
|
end
|
164
|
-
|
165
|
-
bib
|
166
167
|
end
|
167
168
|
|
168
|
-
def
|
169
|
-
|
170
|
-
indent > 0,
|
171
|
-
delta == 0,
|
172
|
-
b.length < 50,
|
173
|
-
a.length < 65,
|
174
|
-
!!a.match(/[,;:&\p{Pd}]$/),
|
175
|
-
!!b.match(/^\p{Ll}/) || !!a.match(/\p{L}$/) && !!b.match(/^\p{L}/)
|
176
|
-
].count(true)
|
177
|
-
|
178
|
-
con = [
|
179
|
-
indent < 0,
|
180
|
-
delta > 8,
|
181
|
-
!!a.match(/\.\]$/),
|
182
|
-
a.length > 500,
|
183
|
-
(b.length - a.length) > 12,
|
184
|
-
!!b.match(/^(\p{Pd}\p{Pd}|\p{Lu}\p{Ll}+, \p{Lu}\.|\[\d)/)
|
185
|
-
].count(true)
|
186
|
-
|
187
|
-
(pro - con) > 1
|
169
|
+
def include_references?(rc, tc)
|
170
|
+
rc > 10 || (rc + tc) > 20 && (rc.to_f / tc) > 0.2
|
188
171
|
end
|
189
172
|
|
190
|
-
def
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
173
|
+
def sections(delimiter: "\n", spacer: ' ', **opts)
|
174
|
+
each_section.map do |(head, body)|
|
175
|
+
{
|
176
|
+
title: head.map { |tk|
|
177
|
+
display_chars(tk.value).lstrip.unicode_normalize
|
178
|
+
}.join(spacer),
|
179
|
+
text: body.map { |tk|
|
180
|
+
display_chars(tk.value).unicode_normalize
|
181
|
+
}.join(delimiter)
|
182
|
+
}
|
199
183
|
end
|
200
184
|
end
|
201
185
|
|
202
|
-
def sections(delimiter: "\n", **opts)
|
203
|
-
[]
|
204
|
-
end
|
205
|
-
|
206
186
|
def title(delimiter: " ", **opts)
|
207
187
|
lines.drop_while { |ln|
|
208
188
|
ln.label != 'title'
|
@@ -214,51 +194,5 @@ module AnyStyle
|
|
214
194
|
def inspect
|
215
195
|
"#<AnyStyle::Document lines={#{size}}>"
|
216
196
|
end
|
217
|
-
|
218
|
-
|
219
|
-
class Page
|
220
|
-
extend StringUtils
|
221
|
-
|
222
|
-
class << self
|
223
|
-
def parse(lines)
|
224
|
-
pages, current, width = [], [], 0
|
225
|
-
|
226
|
-
lines.each do |line|
|
227
|
-
if page_break?(line.value)
|
228
|
-
unless current.empty?
|
229
|
-
pages << new(current, width: width)
|
230
|
-
end
|
231
|
-
|
232
|
-
current = [line]
|
233
|
-
width = display_width(line.value)
|
234
|
-
else
|
235
|
-
current << line
|
236
|
-
width = [width, display_width(line.value)].max
|
237
|
-
end
|
238
|
-
end
|
239
|
-
|
240
|
-
unless current.empty?
|
241
|
-
pages << new(current, width: width)
|
242
|
-
end
|
243
|
-
|
244
|
-
pages
|
245
|
-
end
|
246
|
-
end
|
247
|
-
|
248
|
-
attr_accessor :lines, :width
|
249
|
-
|
250
|
-
def initialize(lines = [], width: 0)
|
251
|
-
@lines = lines
|
252
|
-
@width = width
|
253
|
-
end
|
254
|
-
|
255
|
-
def size
|
256
|
-
lines.size
|
257
|
-
end
|
258
|
-
|
259
|
-
def inspect
|
260
|
-
"#<AnyStyle::Document::Page size={#{size}} width={#{width}}>"
|
261
|
-
end
|
262
|
-
end
|
263
197
|
end
|
264
198
|
end
|
data/lib/anystyle/feature/line.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
module AnyStyle
|
2
2
|
class Feature
|
3
3
|
class Line < Feature
|
4
|
-
def observe(token, page:, **opts)
|
5
|
-
chars = display_chars(token)
|
4
|
+
def observe(token, page:, seq:, **opts)
|
5
|
+
chars = display_chars(token)
|
6
6
|
|
7
7
|
lttrs = count(chars, /\p{L}/)
|
8
8
|
upper = count(chars, /\p{Lu}/)
|
@@ -18,22 +18,37 @@ module AnyStyle
|
|
18
18
|
ratio(white, chars.length),
|
19
19
|
ratio(punct, chars.length),
|
20
20
|
ratio(width, page.width),
|
21
|
-
classify(chars)
|
21
|
+
classify(chars),
|
22
|
+
page_ratio(seq.line_counts[chars], seq.pages.length),
|
23
|
+
page_ratio(seq.nnum_counts[nnum(chars)], seq.pages.length)
|
22
24
|
]
|
23
25
|
end
|
24
26
|
|
25
27
|
def classify(chars)
|
26
|
-
case chars
|
27
|
-
when /\.\s*\.\s*\.\s
|
28
|
+
case chars.lstrip
|
29
|
+
when /\.\s*\.\s*\.\s*\.|……+/, /\p{L}\s{5,}\d+$/
|
28
30
|
:toc
|
29
|
-
when
|
30
|
-
:
|
31
|
-
when
|
31
|
+
when /^[\[\(]?\d+\.?[\]\)]?\s+\p{L}+/
|
32
|
+
:list
|
33
|
+
when /^(\p{Lu}\.?)\s*(\d+\.)+\s+\p{L}+/
|
34
|
+
:title
|
35
|
+
when /^(\w+\s)?(tab(le|elle|\.)|fig(ure|\.))/i
|
32
36
|
:cap
|
37
|
+
when /^\p{Pd}?\d+\p{Pd}?$/, /^[ivx]+$/i
|
38
|
+
:num
|
39
|
+
when /copyright|©|rights reserved/i
|
40
|
+
:copyright
|
41
|
+
when /https?:\/\//i
|
42
|
+
:http
|
33
43
|
else
|
34
44
|
:none
|
35
45
|
end
|
36
46
|
end
|
47
|
+
|
48
|
+
def page_ratio(a, b)
|
49
|
+
r = a.to_f / b
|
50
|
+
r == 1 ? '=' : r > 1 ? '+' : (r * 10).round
|
51
|
+
end
|
37
52
|
end
|
38
53
|
end
|
39
54
|
end
|
data/lib/anystyle/feature/ref.rb
CHANGED
@@ -3,8 +3,8 @@ module AnyStyle
|
|
3
3
|
class Ref < Feature
|
4
4
|
def observe(token, **opts)
|
5
5
|
[
|
6
|
-
symbolize(count(token, /\b(1
|
7
|
-
symbolize(count(token, /(\d[\(:;]\d)|(\d\s*\p{Pd}+\s*\d)|\bpp?\.|\bvols?\.|\
|
6
|
+
symbolize(count(token, /\b(1[4-9]|20)\d\d\b/)),
|
7
|
+
symbolize(count(token, /(\d[\(:;]\d)|(\d\s*\p{Pd}+\s*\d)|\bpp?\.|\bvols?\.|\b(nos?|nr|iss?|fasc)\.|n°|nº/i)),
|
8
8
|
symbolize(count(token, /\b\p{Lu}\./)),
|
9
9
|
symbolize(count(token, /\b(eds?\.|edited by|editors?|hg|hrsg|et al)\b/i)),
|
10
10
|
token =~ /^\s*(\[\w+\]|\(\d+\)|\d+\.)\s+/ ? 'T' : 'F'
|
@@ -13,8 +13,8 @@ module AnyStyle
|
|
13
13
|
|
14
14
|
def symbolize(k)
|
15
15
|
return '-' if k < 1
|
16
|
-
return '+' if k <
|
17
|
-
return '
|
16
|
+
return '+' if k < 2
|
17
|
+
return '*'
|
18
18
|
end
|
19
19
|
end
|
20
20
|
end
|
data/lib/anystyle/finder.rb
CHANGED
@@ -63,18 +63,18 @@ module AnyStyle
|
|
63
63
|
dataset.map { |doc| doc.references(**opts) }
|
64
64
|
end
|
65
65
|
|
66
|
-
def label(input, layout: true, **opts)
|
67
|
-
dataset = prepare(input, layout: layout, **opts)
|
66
|
+
def label(input, layout: true, crop: false, **opts)
|
67
|
+
dataset = prepare(input, layout: layout, crop: crop, **opts)
|
68
68
|
output = model.label(dataset, **opts)
|
69
69
|
Wapiti::Dataset.new(dataset.map.with_index { |doc, idx|
|
70
70
|
doc.label(output[idx])
|
71
71
|
})
|
72
72
|
end
|
73
73
|
|
74
|
-
def prepare(input, layout: true, **opts)
|
74
|
+
def prepare(input, layout: true, crop: false, **opts)
|
75
75
|
case input
|
76
76
|
when String
|
77
|
-
super(Document.open(input, layout: layout, **opts), **opts)
|
77
|
+
super(Document.open(input, layout: layout, crop: false, **opts), **opts)
|
78
78
|
when Array
|
79
79
|
super(Wapiti::Dataset.new(input.map { |f| Document.open(f, **opts) }), **opts)
|
80
80
|
else
|
data/lib/anystyle/normalizer/locale.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
module AnyStyle
|
2
2
|
maybe_require 'language_detector'
|
3
|
+
maybe_require 'unicode/scripts'
|
3
4
|
|
4
5
|
class Normalizer
|
5
6
|
class Locale < Normalizer
|
@@ -8,23 +9,33 @@ module AnyStyle
|
|
8
9
|
end
|
9
10
|
|
10
11
|
def normalize(item, **opts)
|
11
|
-
return item if @ld.nil? || item.key?(:language)
|
12
|
-
|
13
12
|
sample = item.values_at(
|
14
13
|
:title,
|
15
14
|
:'container-title',
|
16
|
-
|
15
|
+
:'collection-title',
|
17
16
|
:location,
|
18
17
|
:journal,
|
19
|
-
:publisher
|
20
|
-
|
18
|
+
:publisher,
|
19
|
+
:note
|
21
20
|
).flatten.compact.join(' ')
|
22
21
|
|
23
22
|
return item if sample.empty?
|
24
23
|
|
25
|
-
|
24
|
+
language = detect_language(sample)
|
25
|
+
scripts = detect_scripts(sample)
|
26
|
+
|
27
|
+
item[:language] ||= language unless language.nil?
|
28
|
+
item[:scripts] ||= scripts unless scripts.nil?
|
26
29
|
item
|
27
30
|
end
|
28
31
|
end
|
32
|
+
|
33
|
+
def detect_language(string)
|
34
|
+
@ld.detect(string) unless @ld.nil?
|
35
|
+
end
|
36
|
+
|
37
|
+
def detect_scripts(string)
|
38
|
+
::Unicode::Scripts.scripts(string) if defined?(::Unicode::Scripts)
|
39
|
+
end
|
29
40
|
end
|
30
41
|
end
|