anystyle 1.2.1 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: d07c8a520550b0faa3e7a9b3c5b7368fdce46c02
4
- data.tar.gz: 4256158f3ed52621a10e25147bab967c8097578d
2
+ SHA256:
3
+ metadata.gz: 4a8c6471369e8969b190536c6f597742e6917197ce26009b1f7c7dcd1ec32168
4
+ data.tar.gz: 1f9af1e1337c47fda651b40fd19903d7474fe860b679bcafd832b932fc075947
5
5
  SHA512:
6
- metadata.gz: 22c6e55e7b9bbb2cbfa36ef0b1ffc72fe117b2298e4a82b74369a0d9834476fb925222b1a0c6df5ed97cc802db7878a52c837295dc7932a90041544dc944ef5d
7
- data.tar.gz: 07edabc567099d9ecd3bcaf3b0461912eb740f340525dd57d5062edb398db6b285f95f12da83ed762514cb4ebf58cbb8ac13a643d631b8a346e73b46fd182a4a
6
+ metadata.gz: 504a133a3cefedeb24fc9c165e00c9f59f6dfee14b78fa0206b9eadb2d060e236b8e99fe211d94d8df1de9d9c93efc8d3149c931827c5df83bfa56524539ce55
7
+ data.tar.gz: 70a9e1c156fe996980610755971eb4feb899afd2a57e20cbd95c664618326385b88ef8c733268922977a7252b2fff47771ba69892f37eecc94227545c4e956b3
data/HISTORY.md CHANGED
@@ -1,3 +1,11 @@
1
+ 1.3.0 / 2018-09-18
2
+ ==================
3
+ * Updated and improved normalizers and CSL format.
4
+ * Improved Chinese reference tokenization.
5
+ * Added option to customize pdftotext path.
6
+ * Improved Finder reference line joining.
7
+ * Improved Finder model; training sets.
8
+ * Improved Parser model; training sets.
1
9
  1.2.1 / 2018-08-17
2
10
  ==================
3
11
  * Added check and train commands to CLI.
data/lib/anystyle.rb CHANGED
@@ -28,6 +28,7 @@ require 'anystyle/feature/terminal'
28
28
  require 'anystyle/feature/words'
29
29
 
30
30
  require 'anystyle/normalizer'
31
+ require 'anystyle/normalizer/arxiv'
31
32
  require 'anystyle/normalizer/brackets'
32
33
  require 'anystyle/normalizer/container'
33
34
  require 'anystyle/normalizer/date'
@@ -27,8 +27,8 @@ module AnyStyle
27
27
 
28
28
  case format.downcase
29
29
  when '.pdf'
30
- meta = pdf_meta path if opts[:parse_meta]
31
- info = pdf_info path if opts[:parse_info]
30
+ meta = pdf_meta path, **opts if opts[:parse_meta]
31
+ info = pdf_info path, **opts if opts[:parse_info]
32
32
  input = pdf_to_text path, **opts
33
33
  when '.ttx'
34
34
  tagged = true
@@ -9,11 +9,15 @@ module AnyStyle
9
9
  case alpha
10
10
  when /^ed(s|itors?|ited?|iteurs?)?$/i,
11
11
  /^(hg|hrsg|herausgeber)$/i,
12
- /^(compilador)$/i
12
+ /^(compilador)$/i,
13
+ /編/
13
14
  :editor
15
+ when /著|撰/,
16
+ :author
14
17
  when /^trans(l(ated|ators?|ation))?$/i,
15
18
  /^übers(etz(t|ung))?$/i,
16
- /^trad(uction|ucteurs?|uit)?$/i
19
+ /^trad(uction|ucteurs?|uit)?$/i,
20
+ /譯/
17
21
  :translator
18
22
  when /^(dissertation|thesis)$/i
19
23
  :thesis
@@ -21,7 +25,7 @@ module AnyStyle
21
25
  :proceedings
22
26
  when /^(Journal|Zeitschrift|Quarterly|Magazine?|Times|Rev(iew|vue)?|Bulletin|News|Week|Gazett[ea])/
23
27
  :journal
24
- when /^in$/i
28
+ when /^in$/i, /收入/
25
29
  :in
26
30
  when /^([AaUu]nd|y|e)$/
27
31
  :and
@@ -29,21 +33,33 @@ module AnyStyle
29
33
  :etal
30
34
  when /^(pp?|pages?|S(eiten?)?|ff?)$/
31
35
  :page
32
- when /^(vol(ume)?s?|iss(ue)?|n[or]?|number|fasc(icle|icule)?)$/i
36
+ when /^(vol(ume)?s?|iss(ue)?|n[or]?|number|fasc(icle|icule)?|suppl(ement)?)$/i
33
37
  :volume
34
38
  when /^(ser(ies?)?|reihe|[ck]oll(e[ck]tion))$/i
35
39
  :series
40
+ when /^patent$/i
41
+ :patent
42
+ when /^report$/i
43
+ :report
36
44
  when /^(edn|edition|expanded|rev(ised)?|p?reprint(ed)?|illustrated)$/i,
45
+ /^editio|aucta$/i
37
46
  /^(aufl(age)?|\p{Alpha}*ausg(abe)?)$/i
38
47
  :edition
39
48
  when /^(nd|date|spring|s[uo]mmer|autumn|fall|winter|frühling|herbst)$/i,
40
49
  /^(jan(uary?)?|feb(ruary?)?|mar(ch|z)?|apr(il)?|ma[yi]|jun[ei]?)$/,
41
- /^(jul[yi]?|aug(ust)?|sep(tember)?|o[ck]t(ober)?|nov(ember)?|de[cz](ember)?)$/i
50
+ /^(jul[yi]?|aug(ust)?|sep(tember)?|o[ck]t(ober)?|nov(ember)?|de[cz](ember)?)$/i,
51
+ /年/
42
52
  :date
43
- when /^(pmid|pmcid|arxiv|doi|url)/i
53
+ when /^(doi|url)/i
44
54
  :locator
45
- when /^(retrieved|accessed)$/i
55
+ when /^(pmid|pmcid)/i
56
+ :pubmed
57
+ when /^(arxiv)/i
58
+ :arxiv
59
+ when /^(retrieved|retirado|accessed)$/i
46
60
  :accessed
61
+ when /^[ILXVMCD]{2,}$/
62
+ :roman
47
63
  else
48
64
  :none
49
65
  end
@@ -32,7 +32,7 @@ module AnyStyle
32
32
  :list
33
33
  when /^(\p{Lu}\.?)\s*(\d+\.)+\s+\p{L}+/
34
34
  :title
35
- when /^(\w+\s)?(tab(le|elle|\.)|fig(ure|\.))/i
35
+ when /^(\w+\s)?(tab(le|elle|\.)|fig(ure|\.)|equation|graph|abb(ildung)?)/i
36
36
  :cap
37
37
  when /^\p{Pd}?\d+\p{Pd}?$/, /^[ivx]+$/i
38
38
  :num
@@ -3,13 +3,13 @@ module AnyStyle
3
3
  class Quotes < Feature
4
4
  def observe(token, **opts)
5
5
  case token
6
- when /^[^"'”„’‚´«「『‘“`»」』]+$/
6
+ when /^[^"'”„’‚´«「『〈《‘“`»」』〉》]+$/
7
7
  :none
8
- when /^["'”„’‚´«「『‘“`»].*["'”„’‚´«‘“`»」』][,;:\p{Pd}!\?\.]?$/
8
+ when /^["'”„’‚´«「『〈《‘“`»].*["'”„’‚´«‘“`»」』〉》][,;:\p{Pd}!\?\.]?$/
9
9
  :'quote-unquote'
10
- when /^["'”„’‚´«「『‘“`»]/
10
+ when /^["'”„’‚´«「『‘〈《“`»]/
11
11
  :quote
12
- when /["'”„’‚´«‘“`»」』][,;:\p{Pd}!\?\.]?$/
12
+ when /["'”„’‚´«‘“`»」』〉》][,;:\p{Pd}!\?\.]?$/
13
13
  :unquote
14
14
  else
15
15
  :other
@@ -14,11 +14,15 @@ module AnyStyle
14
14
  contents
15
15
  figures
16
16
  introduction
17
+ kurzfassung
17
18
  literatur
18
19
  literature
19
20
  references
20
21
  referenzen
22
+ secondary
21
23
  section
24
+ sources
25
+ summary
22
26
  tables
23
27
  works
24
28
  }
@@ -8,7 +8,10 @@ module AnyStyle
8
8
  compact: true,
9
9
  threads: 4,
10
10
  format: :references,
11
- training_data: Dir[File.join(RES, 'finder', '*.ttx')].map(&:untaint)
11
+ training_data: Dir[File.join(RES, 'finder', '*.ttx')].map(&:untaint),
12
+ layout: true,
13
+ pdftotext: 'pdftotext',
14
+ pdfinfo: 'pdfinfo'
12
15
  }
13
16
 
14
17
  def initialize(options = {})
@@ -71,12 +74,18 @@ module AnyStyle
71
74
  })
72
75
  end
73
76
 
74
- def prepare(input, layout: true, crop: false, **opts)
77
+ def prepare(input,
78
+ layout: options[:layout],
79
+ crop: false,
80
+ pdftotext: options[:pdftotext],
81
+ pdfinfo: options[:pdfinfo],
82
+ **opts)
83
+ doc_opts = { layout: layout, crop: crop, pdftotext: pdftotext, pdfinfo: pdfinfo, **opts }
75
84
  case input
76
85
  when String
77
- super(Document.open(input, layout: layout, crop: crop, **opts), **opts)
86
+ super(Document.open(input, **doc_opts), **opts)
78
87
  when Array
79
- super(Wapiti::Dataset.new(input.map { |f| Document.open(f, **opts) }), **opts)
88
+ super(Wapiti::Dataset.new(input.map { |f| Document.open(f, **doc_opts) }), **opts)
80
89
  else
81
90
  super(input, **opts)
82
91
  end
@@ -3,6 +3,7 @@ module AnyStyle
3
3
  module CSL
4
4
  def format_csl(dataset, **opts)
5
5
  format_hash(dataset).map do |hash|
6
+ dates_to_citeproc(hash, **opts) if hash.key?(:date)
6
7
  flatten_values hash, skip: Normalizer::Names.keys
7
8
 
8
9
  rename_value hash, :pages, :page
@@ -23,6 +24,22 @@ module AnyStyle
23
24
  end
24
25
 
25
26
  alias_method :format_citeproc, :format_csl
27
+
28
+ def dates_to_citeproc(hash, date_format: 'edtf', **opts)
29
+ date, = *hash.delete(:date)
30
+
31
+ case date_format.to_s
32
+ when 'citeproc'
33
+ hash[:issued] = begin
34
+ require 'citeproc'
35
+ ::CiteProc::Date.parse!(date.tr('X~', 'u?')).to_citeproc.symbolize_keys
36
+ rescue
37
+ date
38
+ end
39
+ else
40
+ hash[:issued] = date
41
+ end
42
+ end
26
43
  end
27
44
  end
28
45
  end
@@ -0,0 +1,15 @@
1
+ module AnyStyle
2
+ class Normalizer
3
+ class ArXiv < Normalizer
4
+ @keys = [:note]
5
+
6
+ def normalize(item, **opts)
7
+ each_value(item) do |_, value|
8
+ if (value =~ /arxiv:?\s*(\d{4}\.\d+(?:v\d+)?|\w+(?:.\w+)?\/\d+)/i)
9
+ append item, :arxiv, $1
10
+ end
11
+ end
12
+ end
13
+ end
14
+ end
15
+ end
@@ -5,7 +5,11 @@ module AnyStyle
5
5
 
6
6
  def normalize(item, **opts)
7
7
  map_values(item) do |_, value|
8
- value.sub(/^[Ii]n(?::|\s+the)?\s+(\p{^Ll})/, '\1')
8
+ value
9
+ .sub(/^[Ii]n(?::|\s+the)?\s+(\p{^Ll})/, '\1')
10
+ .sub(/^of\s+/, '')
11
+ .sub(/^收入/, '')
12
+ .sub(/^(\w+ )?presented at (the )?/i, '')
9
13
  end
10
14
  end
11
15
  end
@@ -3,7 +3,7 @@ module AnyStyle
3
3
 
4
4
  class Normalizer
5
5
  class Locator < Normalizer
6
- @keys = [:isbn, :url]
6
+ @keys = [:isbn, :url, :doi]
7
7
 
8
8
  def normalize(item, **opts)
9
9
  map_values(item) do |key, value|
@@ -11,12 +11,20 @@ module AnyStyle
11
11
  when :isbn
12
12
  value[/[\d-]+/]
13
13
  when :url
14
+ doi = doi_extract(value)
15
+ append item, :doi, doi unless doi.nil?
14
16
  URI.extract(value)
17
+ when :doi
18
+ doi_extract(value) || value
15
19
  else
16
20
  value
17
21
  end
18
22
  end
19
23
  end
20
24
  end
25
+
26
+ def doi_extract(value)
27
+ value[/10\.(\d{4,9}\/[-._;()\/:A-Z0-9]+|1002\/\S+)/i]
28
+ end
21
29
  end
22
30
  end
@@ -30,7 +30,7 @@ module AnyStyle
30
30
  begin
31
31
  parse(strip(value))
32
32
  rescue
33
- [{ literal: value }]
33
+ [{ literal: value.strip }]
34
34
  end
35
35
  end
36
36
  end
@@ -14,8 +14,10 @@ module AnyStyle
14
14
 
15
15
  def normalize(item, **opts)
16
16
  each_value(item) do |_, value|
17
- value.gsub!(/[\)\]\.,:;\p{Pd}\p{Z}\p{C}]+$/, '')
18
- value.gsub!(/^[\(\[]/, '')
17
+ value.gsub!(/\s*[\)\]\.,:;\p{Pd}\p{Z}\p{C}。、》〉]+$/, '')
18
+ value.gsub!(/[,:;》〉]+$/, '')
19
+ value.gsub!(/^[\(\[《〈]/, '')
20
+ value.gsub!(/<\/?(italic|bold)>/, '')
19
21
  end
20
22
  end
21
23
  end
@@ -100,7 +100,7 @@ module AnyStyle
100
100
  compact: true,
101
101
  threads: 4,
102
102
  separator: /(?:\r?\n)+/,
103
- delimiter: /\s+/,
103
+ delimiter: /\s+|([\uFF01-\uFF64]|。|、)/,
104
104
  format: :hash,
105
105
  training_data: File.join(RES, 'parser', 'core.xml')
106
106
  }
@@ -139,6 +139,7 @@ module AnyStyle
139
139
  Normalizer::Locator.new,
140
140
  Normalizer::Publisher.new,
141
141
  Normalizer::PubMed.new,
142
+ Normalizer::ArXiv.new,
142
143
  Normalizer::Names.new,
143
144
  Normalizer::Locale.new,
144
145
  Normalizer::Type.new
data/lib/anystyle/refs.rb CHANGED
@@ -90,7 +90,7 @@ module AnyStyle
90
90
  indent_score(indent),
91
91
  delta_score(delta),
92
92
  years_score(a, b),
93
- terminal_score(a),
93
+ terminal_score(a, b),
94
94
  initial_score(a, b),
95
95
  length_score(a, b),
96
96
  pages_score(a, b)
@@ -100,7 +100,7 @@ module AnyStyle
100
100
 
101
101
  def indent_score(indent)
102
102
  case
103
- when indent > 0 then 1
103
+ when indent > 0 then 1.25
104
104
  when indent < 0 then -1
105
105
  else
106
106
  0
@@ -119,10 +119,19 @@ module AnyStyle
119
119
 
120
120
  def years_score(a, b)
121
121
  if match_year?(a)
122
- if b.length > 35 && match_year?(b)
123
- -1
122
+ if match_year?(b)
123
+ case
124
+ when b.length < 18
125
+ 1
126
+ when b.length < 25
127
+ 0.5
128
+ when b.length > 60
129
+ -0.75
130
+ else
131
+ 0
132
+ end
124
133
  else
125
- if a.match(/[\d,] (1[4-9]|2[01])\d\d[a-z]?\.$/)
134
+ if a.match(/[\d,] \(?(1[4-9]|2[01])\d\d[a-z]?\)?\.$/)
126
135
  -0.5
127
136
  else
128
137
  1
@@ -138,10 +147,10 @@ module AnyStyle
138
147
  end
139
148
 
140
149
  def pages_score(a, b)
141
- if match_pages?(a)
150
+ if match_pages?(a, true)
142
151
  -0.25
143
152
  else
144
- if match_pages?(b)
153
+ if match_pages?(b, false)
145
154
  1
146
155
  else
147
156
  0
@@ -149,23 +158,25 @@ module AnyStyle
149
158
  end
150
159
  end
151
160
 
152
- def match_pages?(string)
161
+ def match_pages?(string, not_years = true)
153
162
  m = string.match(/(\d+)\p{Pd}(\d+)|\bpp?\.|\d+\(\d+\)/)
154
163
  return false if m.nil?
155
- return false if m[1] && match_year?(m[1]) && match_year?(m[2])
164
+ return false if not_years && m[1] && match_year?(m[1]) && match_year?(m[2])
156
165
  return true
157
166
  end
158
167
 
159
- def terminal_score(string)
160
- case string
161
- when /https?:\/\/\w+/i
162
- -1
163
- when /[,;:&\p{Pd}]$/, /(et al|pp)\.$/
168
+ def terminal_score(a, b)
169
+ case
170
+ when a.match(/https?:\/\/\w+/i)
171
+ -0.25
172
+ when a.match(/[,;:&\p{Pd}]$/), a.match(/\s(et al|pp|pg)\.$/)
164
173
  2
165
- when /\((1[4-9]|2[01])\d\d\)\.?$/
174
+ when a.match(/\((1[4-9]|2[01])\d\d\)\.?$/)
166
175
  0
167
- when /(\p{^Lu}\.|\])$/
176
+ when a.match(/(\p{^Lu}\.|\])$/)
168
177
  -1
178
+ when a.match(/\d$/) && b.match(/^\p{Lu}/)
179
+ -0.25
169
180
  else
170
181
  0
171
182
  end
@@ -177,13 +188,15 @@ module AnyStyle
177
188
  1.5
178
189
  when a.match(/\p{L}$/) && b.match(/^\p{L}/)
179
190
  1
180
- when b.match(/^["'”„’‚´«「『‘“`»]/), b.match(/^url|http/i)
191
+ when b.match(/^["'”„’‚´«「『‘“`»]/)
181
192
  1
193
+ when b.match(/^(url|doi|isbn|vol)\b/i)
194
+ 1.5
182
195
  when b.match(/^([\p{Pd}_*][\p{Pd}_* ]+|\p{Co})/)
183
196
  -1.5
184
197
  when b.match(/^\((1[4-9]|2[01])\d\d\)/) && !a.match(/(\p{Lu}|al|others)\.$/)
185
198
  -1
186
- when b.match(/^\p{Lu}\p{Ll}+,\s\p{L}/) && !a.match(/\p{L}$/)
199
+ when b.match(/^\p{Lu}[\p{Ll}-]+,?\s\p{Lu}/) && !a.match(/\p{L}$/)
187
200
  -0.5
188
201
  when match_list?(b)
189
202
  if match_list?(a)
@@ -191,13 +204,15 @@ module AnyStyle
191
204
  else
192
205
  -0.75
193
206
  end
207
+ when b.match(/^\p{L}+:/), b.match(/^\p{L}+ \d/)
208
+ 0.5
194
209
  else
195
210
  0
196
211
  end
197
212
  end
198
213
 
199
214
  def match_list?(string)
200
- string.match(/^(\d{1,3}\.\s+\p{L}|\[\p{Alnum}+\])/)
215
+ string.match(/^(\d{1,3}(\.\s+|\s{2,})\p{L}|\[\p{Alnum}+\])/)
201
216
  end
202
217
 
203
218
  def length_score(a, b)
@@ -205,8 +220,14 @@ module AnyStyle
205
220
  when b.length < a.length
206
221
  case
207
222
  when b.length < 10
223
+ 2.5
224
+ when b.length < 15
208
225
  2
226
+ when b.length < 20
227
+ 1.75
209
228
  when b.length < 25
229
+ 1.5
230
+ when b.length < 30
210
231
  1
211
232
  when b.length < 50
212
233
  0.75
@@ -221,7 +242,7 @@ module AnyStyle
221
242
  end
222
243
 
223
244
  def join(a, b)
224
- if a.end_with? '-'
245
+ if a =~ /\p{Pd}$/
225
246
  if a =~ /\p{Ll}-$/ && b =~ /^\p{Ll}/
226
247
  "#{a[0...-1]}#{b}"
227
248
  else