anystyle 1.2.1 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/HISTORY.md +8 -0
- data/lib/anystyle.rb +1 -0
- data/lib/anystyle/document.rb +2 -2
- data/lib/anystyle/feature/keyword.rb +23 -7
- data/lib/anystyle/feature/line.rb +1 -1
- data/lib/anystyle/feature/quotes.rb +4 -4
- data/lib/anystyle/feature/words.rb +4 -0
- data/lib/anystyle/finder.rb +13 -4
- data/lib/anystyle/format/csl.rb +17 -0
- data/lib/anystyle/normalizer/arxiv.rb +15 -0
- data/lib/anystyle/normalizer/container.rb +5 -1
- data/lib/anystyle/normalizer/locator.rb +9 -1
- data/lib/anystyle/normalizer/names.rb +1 -1
- data/lib/anystyle/normalizer/punctuation.rb +4 -2
- data/lib/anystyle/parser.rb +2 -1
- data/lib/anystyle/refs.rb +41 -20
- data/lib/anystyle/support/finder.mod +7089 -5838
- data/lib/anystyle/support/parser.mod +17212 -12817
- data/lib/anystyle/utils.rb +11 -7
- data/lib/anystyle/version.rb +1 -1
- data/res/finder/bb132pr2055.ttx +3 -3
- data/res/finder/bb408gp7470.ttx +3 -3
- data/res/finder/bc605xz1554.ttx +1 -1
- data/res/finder/bd040gx5718.ttx +3 -3
- data/res/finder/bd413nt2715.ttx +7 -7
- data/res/finder/bg599vt3743.ttx +2 -2
- data/res/parser/bad.xml +0 -55
- data/res/parser/core.xml +2113 -155
- data/res/parser/gold.xml +10428 -7117
- data/res/parser/good.xml +2 -1048
- data/res/parser/ugly.xml +2 -1393
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 4a8c6471369e8969b190536c6f597742e6917197ce26009b1f7c7dcd1ec32168
|
4
|
+
data.tar.gz: 1f9af1e1337c47fda651b40fd19903d7474fe860b679bcafd832b932fc075947
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 504a133a3cefedeb24fc9c165e00c9f59f6dfee14b78fa0206b9eadb2d060e236b8e99fe211d94d8df1de9d9c93efc8d3149c931827c5df83bfa56524539ce55
|
7
|
+
data.tar.gz: 70a9e1c156fe996980610755971eb4feb899afd2a57e20cbd95c664618326385b88ef8c733268922977a7252b2fff47771ba69892f37eecc94227545c4e956b3
|
data/HISTORY.md
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
1.3.0 / 2018-09-18
|
2
|
+
==================
|
3
|
+
* Updated and improved normalizers and CSL format.
|
4
|
+
* Improved Chinese reference tokenization.
|
5
|
+
* Added option to customize pdftotext path.
|
6
|
+
* Improved Finder reference line joining.
|
7
|
+
* Improved Finder model; training sets.
|
8
|
+
* Improved Parser model; training sets.
|
1
9
|
1.2.1 / 2018-08-17
|
2
10
|
==================
|
3
11
|
* Added check and train commands to CLI.
|
data/lib/anystyle.rb
CHANGED
@@ -28,6 +28,7 @@ require 'anystyle/feature/terminal'
|
|
28
28
|
require 'anystyle/feature/words'
|
29
29
|
|
30
30
|
require 'anystyle/normalizer'
|
31
|
+
require 'anystyle/normalizer/arxiv'
|
31
32
|
require 'anystyle/normalizer/brackets'
|
32
33
|
require 'anystyle/normalizer/container'
|
33
34
|
require 'anystyle/normalizer/date'
|
data/lib/anystyle/document.rb
CHANGED
@@ -27,8 +27,8 @@ module AnyStyle
|
|
27
27
|
|
28
28
|
case format.downcase
|
29
29
|
when '.pdf'
|
30
|
-
meta = pdf_meta path if opts[:parse_meta]
|
31
|
-
info = pdf_info path if opts[:parse_info]
|
30
|
+
meta = pdf_meta path, **opts if opts[:parse_meta]
|
31
|
+
info = pdf_info path, **opts if opts[:parse_info]
|
32
32
|
input = pdf_to_text path, **opts
|
33
33
|
when '.ttx'
|
34
34
|
tagged = true
|
@@ -9,11 +9,15 @@ module AnyStyle
|
|
9
9
|
case alpha
|
10
10
|
when /^ed(s|itors?|ited?|iteurs?)?$/i,
|
11
11
|
/^(hg|hrsg|herausgeber)$/i,
|
12
|
-
/^(compilador)$/i
|
12
|
+
/^(compilador)$/i,
|
13
|
+
/編/
|
13
14
|
:editor
|
15
|
+
when /著|撰/,
|
16
|
+
:author
|
14
17
|
when /^trans(l(ated|ators?|ation))?$/i,
|
15
18
|
/^übers(etz(t|ung))?$/i,
|
16
|
-
/^trad(uction|ucteurs?|uit)?$/i
|
19
|
+
/^trad(uction|ucteurs?|uit)?$/i,
|
20
|
+
/譯/
|
17
21
|
:translator
|
18
22
|
when /^(dissertation|thesis)$/i
|
19
23
|
:thesis
|
@@ -21,7 +25,7 @@ module AnyStyle
|
|
21
25
|
:proceedings
|
22
26
|
when /^(Journal|Zeitschrift|Quarterly|Magazine?|Times|Rev(iew|vue)?|Bulletin|News|Week|Gazett[ea])/
|
23
27
|
:journal
|
24
|
-
when /^in$/i
|
28
|
+
when /^in$/i, /收入/
|
25
29
|
:in
|
26
30
|
when /^([AaUu]nd|y|e)$/
|
27
31
|
:and
|
@@ -29,21 +33,33 @@ module AnyStyle
|
|
29
33
|
:etal
|
30
34
|
when /^(pp?|pages?|S(eiten?)?|ff?)$/
|
31
35
|
:page
|
32
|
-
when /^(vol(ume)?s?|iss(ue)?|n[or]?|number|fasc(icle|icule)?)$/i
|
36
|
+
when /^(vol(ume)?s?|iss(ue)?|n[or]?|number|fasc(icle|icule)?|suppl(ement)?)$/i
|
33
37
|
:volume
|
34
38
|
when /^(ser(ies?)?|reihe|[ck]oll(e[ck]tion))$/i
|
35
39
|
:series
|
40
|
+
when /^patent$/i
|
41
|
+
:patent
|
42
|
+
when /^report$/i
|
43
|
+
:report
|
36
44
|
when /^(edn|edition|expanded|rev(ised)?|p?reprint(ed)?|illustrated)$/i,
|
45
|
+
/^editio|aucta$/i
|
37
46
|
/^(aufl(age)?|\p{Alpha}*ausg(abe)?)$/i
|
38
47
|
:edition
|
39
48
|
when /^(nd|date|spring|s[uo]mmer|autumn|fall|winter|frühling|herbst)$/i,
|
40
49
|
/^(jan(uary?)?|feb(ruary?)?|mar(ch|z)?|apr(il)?|ma[yi]|jun[ei]?)$/,
|
41
|
-
/^(jul[yi]?|aug(ust)?|sep(tember)?|o[ck]t(ober)?|nov(ember)?|de[cz](ember)?)$/i
|
50
|
+
/^(jul[yi]?|aug(ust)?|sep(tember)?|o[ck]t(ober)?|nov(ember)?|de[cz](ember)?)$/i,
|
51
|
+
/年/
|
42
52
|
:date
|
43
|
-
when /^(
|
53
|
+
when /^(doi|url)/i
|
44
54
|
:locator
|
45
|
-
when /^(
|
55
|
+
when /^(pmid|pmcid)/i
|
56
|
+
:pubmed
|
57
|
+
when /^(arxiv)/i
|
58
|
+
:arxiv
|
59
|
+
when /^(retrieved|retirado|accessed)$/i
|
46
60
|
:accessed
|
61
|
+
when /^[ILXVMCD]{2,}$/
|
62
|
+
:roman
|
47
63
|
else
|
48
64
|
:none
|
49
65
|
end
|
@@ -32,7 +32,7 @@ module AnyStyle
|
|
32
32
|
:list
|
33
33
|
when /^(\p{Lu}\.?)\s*(\d+\.)+\s+\p{L}+/
|
34
34
|
:title
|
35
|
-
when /^(\w+\s)?(tab(le|elle|\.)|fig(ure|\.))/i
|
35
|
+
when /^(\w+\s)?(tab(le|elle|\.)|fig(ure|\.)|equation|graph|abb(ildung)?)/i
|
36
36
|
:cap
|
37
37
|
when /^\p{Pd}?\d+\p{Pd}?$/, /^[ivx]+$/i
|
38
38
|
:num
|
@@ -3,13 +3,13 @@ module AnyStyle
|
|
3
3
|
class Quotes < Feature
|
4
4
|
def observe(token, **opts)
|
5
5
|
case token
|
6
|
-
when /^[^"'
|
6
|
+
when /^[^"'”„’‚´«「『〈《‘“`»」』〉》]+$/
|
7
7
|
:none
|
8
|
-
when /^["'
|
8
|
+
when /^["'”„’‚´«「『〈《‘“`»].*["'”„’‚´«‘“`»」』〉》][,;:\p{Pd}!\?\.]?$/
|
9
9
|
:'quote-unquote'
|
10
|
-
when /^["'
|
10
|
+
when /^["'”„’‚´«「『‘〈《“`»]/
|
11
11
|
:quote
|
12
|
-
when /["'
|
12
|
+
when /["'”„’‚´«‘“`»」』〉》][,;:\p{Pd}!\?\.]?$/
|
13
13
|
:unquote
|
14
14
|
else
|
15
15
|
:other
|
data/lib/anystyle/finder.rb
CHANGED
@@ -8,7 +8,10 @@ module AnyStyle
|
|
8
8
|
compact: true,
|
9
9
|
threads: 4,
|
10
10
|
format: :references,
|
11
|
-
training_data: Dir[File.join(RES, 'finder', '*.ttx')].map(&:untaint)
|
11
|
+
training_data: Dir[File.join(RES, 'finder', '*.ttx')].map(&:untaint),
|
12
|
+
layout: true,
|
13
|
+
pdftotext: 'pdftotext',
|
14
|
+
pdfinfo: 'pdfinfo'
|
12
15
|
}
|
13
16
|
|
14
17
|
def initialize(options = {})
|
@@ -71,12 +74,18 @@ module AnyStyle
|
|
71
74
|
})
|
72
75
|
end
|
73
76
|
|
74
|
-
def prepare(input,
|
77
|
+
def prepare(input,
|
78
|
+
layout: options[:layout],
|
79
|
+
crop: false,
|
80
|
+
pdftotext: options[:pdftotext],
|
81
|
+
pdfinfo: options[:pdfinfo],
|
82
|
+
**opts)
|
83
|
+
doc_opts = { layout: layout, crop: crop, pdftotext: pdftotext, pdfinfo: pdfinfo, **opts }
|
75
84
|
case input
|
76
85
|
when String
|
77
|
-
super(Document.open(input,
|
86
|
+
super(Document.open(input, **doc_opts), **opts)
|
78
87
|
when Array
|
79
|
-
super(Wapiti::Dataset.new(input.map { |f| Document.open(f, **
|
88
|
+
super(Wapiti::Dataset.new(input.map { |f| Document.open(f, **doc_opts) }), **opts)
|
80
89
|
else
|
81
90
|
super(input, **opts)
|
82
91
|
end
|
data/lib/anystyle/format/csl.rb
CHANGED
@@ -3,6 +3,7 @@ module AnyStyle
|
|
3
3
|
module CSL
|
4
4
|
def format_csl(dataset, **opts)
|
5
5
|
format_hash(dataset).map do |hash|
|
6
|
+
dates_to_citeproc(hash, **opts) if hash.key?(:date)
|
6
7
|
flatten_values hash, skip: Normalizer::Names.keys
|
7
8
|
|
8
9
|
rename_value hash, :pages, :page
|
@@ -23,6 +24,22 @@ module AnyStyle
|
|
23
24
|
end
|
24
25
|
|
25
26
|
alias_method :format_citeproc, :format_csl
|
27
|
+
|
28
|
+
def dates_to_citeproc(hash, date_format: 'edtf', **opts)
|
29
|
+
date, = *hash.delete(:date)
|
30
|
+
|
31
|
+
case date_format.to_s
|
32
|
+
when 'citeproc'
|
33
|
+
hash[:issued] = begin
|
34
|
+
require 'citeproc'
|
35
|
+
::CiteProc::Date.parse!(date.tr('X~', 'u?')).to_citeproc.symbolize_keys
|
36
|
+
rescue
|
37
|
+
date
|
38
|
+
end
|
39
|
+
else
|
40
|
+
hash[:issued] = date
|
41
|
+
end
|
42
|
+
end
|
26
43
|
end
|
27
44
|
end
|
28
45
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module AnyStyle
|
2
|
+
class Normalizer
|
3
|
+
class ArXiv < Normalizer
|
4
|
+
@keys = [:note]
|
5
|
+
|
6
|
+
def normalize(item, **opts)
|
7
|
+
each_value(item) do |_, value|
|
8
|
+
if (value =~ /arxiv:?\s*(\d{4}\.\d+(?:v\d+)?|\w+(?:.\w+)?\/\d+)/i)
|
9
|
+
append item, :arxiv, $1
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
@@ -5,7 +5,11 @@ module AnyStyle
|
|
5
5
|
|
6
6
|
def normalize(item, **opts)
|
7
7
|
map_values(item) do |_, value|
|
8
|
-
value
|
8
|
+
value
|
9
|
+
.sub(/^[Ii]n(?::|\s+the)?\s+(\p{^Ll})/, '\1')
|
10
|
+
.sub(/^of\s+/, '')
|
11
|
+
.sub(/^收入/, '')
|
12
|
+
.sub(/^(\w+ )?presented at (the )?/i, '')
|
9
13
|
end
|
10
14
|
end
|
11
15
|
end
|
@@ -3,7 +3,7 @@ module AnyStyle
|
|
3
3
|
|
4
4
|
class Normalizer
|
5
5
|
class Locator < Normalizer
|
6
|
-
@keys = [:isbn, :url]
|
6
|
+
@keys = [:isbn, :url, :doi]
|
7
7
|
|
8
8
|
def normalize(item, **opts)
|
9
9
|
map_values(item) do |key, value|
|
@@ -11,12 +11,20 @@ module AnyStyle
|
|
11
11
|
when :isbn
|
12
12
|
value[/[\d-]+/]
|
13
13
|
when :url
|
14
|
+
doi = doi_extract(value)
|
15
|
+
append item, :doi, doi unless doi.nil?
|
14
16
|
URI.extract(value)
|
17
|
+
when :doi
|
18
|
+
doi_extract(value) || value
|
15
19
|
else
|
16
20
|
value
|
17
21
|
end
|
18
22
|
end
|
19
23
|
end
|
20
24
|
end
|
25
|
+
|
26
|
+
def doi_extract(value)
|
27
|
+
value[/10\.(\d{4,9}\/[-._;()\/:A-Z0-9]+|1002\/\S+)/i]
|
28
|
+
end
|
21
29
|
end
|
22
30
|
end
|
@@ -14,8 +14,10 @@ module AnyStyle
|
|
14
14
|
|
15
15
|
def normalize(item, **opts)
|
16
16
|
each_value(item) do |_, value|
|
17
|
-
value.gsub!(
|
18
|
-
value.gsub!(
|
17
|
+
value.gsub!(/\s*[\)\]\.,:;\p{Pd}\p{Z}\p{C}。、》〉]+$/, '')
|
18
|
+
value.gsub!(/[,:;》〉]+$/, '')
|
19
|
+
value.gsub!(/^[\(\[《〈]/, '')
|
20
|
+
value.gsub!(/<\/?(italic|bold)>/, '')
|
19
21
|
end
|
20
22
|
end
|
21
23
|
end
|
data/lib/anystyle/parser.rb
CHANGED
@@ -100,7 +100,7 @@ module AnyStyle
|
|
100
100
|
compact: true,
|
101
101
|
threads: 4,
|
102
102
|
separator: /(?:\r?\n)+/,
|
103
|
-
delimiter: /\s
|
103
|
+
delimiter: /\s+|([\uFF01-\uFF64]|。|、)/,
|
104
104
|
format: :hash,
|
105
105
|
training_data: File.join(RES, 'parser', 'core.xml')
|
106
106
|
}
|
@@ -139,6 +139,7 @@ module AnyStyle
|
|
139
139
|
Normalizer::Locator.new,
|
140
140
|
Normalizer::Publisher.new,
|
141
141
|
Normalizer::PubMed.new,
|
142
|
+
Normalizer::ArXiv.new,
|
142
143
|
Normalizer::Names.new,
|
143
144
|
Normalizer::Locale.new,
|
144
145
|
Normalizer::Type.new
|
data/lib/anystyle/refs.rb
CHANGED
@@ -90,7 +90,7 @@ module AnyStyle
|
|
90
90
|
indent_score(indent),
|
91
91
|
delta_score(delta),
|
92
92
|
years_score(a, b),
|
93
|
-
terminal_score(a),
|
93
|
+
terminal_score(a, b),
|
94
94
|
initial_score(a, b),
|
95
95
|
length_score(a, b),
|
96
96
|
pages_score(a, b)
|
@@ -100,7 +100,7 @@ module AnyStyle
|
|
100
100
|
|
101
101
|
def indent_score(indent)
|
102
102
|
case
|
103
|
-
when indent > 0 then 1
|
103
|
+
when indent > 0 then 1.25
|
104
104
|
when indent < 0 then -1
|
105
105
|
else
|
106
106
|
0
|
@@ -119,10 +119,19 @@ module AnyStyle
|
|
119
119
|
|
120
120
|
def years_score(a, b)
|
121
121
|
if match_year?(a)
|
122
|
-
if
|
123
|
-
|
122
|
+
if match_year?(b)
|
123
|
+
case
|
124
|
+
when b.length < 18
|
125
|
+
1
|
126
|
+
when b.length < 25
|
127
|
+
0.5
|
128
|
+
when b.length > 60
|
129
|
+
-0.75
|
130
|
+
else
|
131
|
+
0
|
132
|
+
end
|
124
133
|
else
|
125
|
-
if a.match(/[\d,] (1[4-9]|2[01])\d\d[a-z]?\.$/)
|
134
|
+
if a.match(/[\d,] \(?(1[4-9]|2[01])\d\d[a-z]?\)?\.$/)
|
126
135
|
-0.5
|
127
136
|
else
|
128
137
|
1
|
@@ -138,10 +147,10 @@ module AnyStyle
|
|
138
147
|
end
|
139
148
|
|
140
149
|
def pages_score(a, b)
|
141
|
-
if match_pages?(a)
|
150
|
+
if match_pages?(a, true)
|
142
151
|
-0.25
|
143
152
|
else
|
144
|
-
if match_pages?(b)
|
153
|
+
if match_pages?(b, false)
|
145
154
|
1
|
146
155
|
else
|
147
156
|
0
|
@@ -149,23 +158,25 @@ module AnyStyle
|
|
149
158
|
end
|
150
159
|
end
|
151
160
|
|
152
|
-
def match_pages?(string)
|
161
|
+
def match_pages?(string, not_years = true)
|
153
162
|
m = string.match(/(\d+)\p{Pd}(\d+)|\bpp?\.|\d+\(\d+\)/)
|
154
163
|
return false if m.nil?
|
155
|
-
return false if m[1] && match_year?(m[1]) && match_year?(m[2])
|
164
|
+
return false if not_years && m[1] && match_year?(m[1]) && match_year?(m[2])
|
156
165
|
return true
|
157
166
|
end
|
158
167
|
|
159
|
-
def terminal_score(
|
160
|
-
case
|
161
|
-
when /https?:\/\/\w+/i
|
162
|
-
-
|
163
|
-
when /[,;:&\p{Pd}]
|
168
|
+
def terminal_score(a, b)
|
169
|
+
case
|
170
|
+
when a.match(/https?:\/\/\w+/i)
|
171
|
+
-0.25
|
172
|
+
when a.match(/[,;:&\p{Pd}]$/), a.match(/\s(et al|pp|pg)\.$/)
|
164
173
|
2
|
165
|
-
when /\((1[4-9]|2[01])\d\d\)\.?$/
|
174
|
+
when a.match(/\((1[4-9]|2[01])\d\d\)\.?$/)
|
166
175
|
0
|
167
|
-
when /(\p{^Lu}\.|\])$/
|
176
|
+
when a.match(/(\p{^Lu}\.|\])$/)
|
168
177
|
-1
|
178
|
+
when a.match(/\d$/) && b.match(/^\p{Lu}/)
|
179
|
+
-0.25
|
169
180
|
else
|
170
181
|
0
|
171
182
|
end
|
@@ -177,13 +188,15 @@ module AnyStyle
|
|
177
188
|
1.5
|
178
189
|
when a.match(/\p{L}$/) && b.match(/^\p{L}/)
|
179
190
|
1
|
180
|
-
when b.match(/^["'”„’‚´«「『‘“`»]/)
|
191
|
+
when b.match(/^["'”„’‚´«「『‘“`»]/)
|
181
192
|
1
|
193
|
+
when b.match(/^(url|doi|isbn|vol)\b/i)
|
194
|
+
1.5
|
182
195
|
when b.match(/^([\p{Pd}_*][\p{Pd}_* ]+|\p{Co})/)
|
183
196
|
-1.5
|
184
197
|
when b.match(/^\((1[4-9]|2[01])\d\d\)/) && !a.match(/(\p{Lu}|al|others)\.$/)
|
185
198
|
-1
|
186
|
-
when b.match(/^\p{Lu}\p{Ll}
|
199
|
+
when b.match(/^\p{Lu}[\p{Ll}-]+,?\s\p{Lu}/) && !a.match(/\p{L}$/)
|
187
200
|
-0.5
|
188
201
|
when match_list?(b)
|
189
202
|
if match_list?(a)
|
@@ -191,13 +204,15 @@ module AnyStyle
|
|
191
204
|
else
|
192
205
|
-0.75
|
193
206
|
end
|
207
|
+
when b.match(/^\p{L}+:/), b.match(/^\p{L}+ \d/)
|
208
|
+
0.5
|
194
209
|
else
|
195
210
|
0
|
196
211
|
end
|
197
212
|
end
|
198
213
|
|
199
214
|
def match_list?(string)
|
200
|
-
string.match(/^(\d{1,3}\.\s
|
215
|
+
string.match(/^(\d{1,3}(\.\s+|\s{2,})\p{L}|\[\p{Alnum}+\])/)
|
201
216
|
end
|
202
217
|
|
203
218
|
def length_score(a, b)
|
@@ -205,8 +220,14 @@ module AnyStyle
|
|
205
220
|
when b.length < a.length
|
206
221
|
case
|
207
222
|
when b.length < 10
|
223
|
+
2.5
|
224
|
+
when b.length < 15
|
208
225
|
2
|
226
|
+
when b.length < 20
|
227
|
+
1.75
|
209
228
|
when b.length < 25
|
229
|
+
1.5
|
230
|
+
when b.length < 30
|
210
231
|
1
|
211
232
|
when b.length < 50
|
212
233
|
0.75
|
@@ -221,7 +242,7 @@ module AnyStyle
|
|
221
242
|
end
|
222
243
|
|
223
244
|
def join(a, b)
|
224
|
-
if a
|
245
|
+
if a =~ /\p{Pd}$/
|
225
246
|
if a =~ /\p{Ll}-$/ && b =~ /^\p{Ll}/
|
226
247
|
"#{a[0...-1]}#{b}"
|
227
248
|
else
|