anystyle 1.2.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: d07c8a520550b0faa3e7a9b3c5b7368fdce46c02
4
- data.tar.gz: 4256158f3ed52621a10e25147bab967c8097578d
2
+ SHA256:
3
+ metadata.gz: 4a8c6471369e8969b190536c6f597742e6917197ce26009b1f7c7dcd1ec32168
4
+ data.tar.gz: 1f9af1e1337c47fda651b40fd19903d7474fe860b679bcafd832b932fc075947
5
5
  SHA512:
6
- metadata.gz: 22c6e55e7b9bbb2cbfa36ef0b1ffc72fe117b2298e4a82b74369a0d9834476fb925222b1a0c6df5ed97cc802db7878a52c837295dc7932a90041544dc944ef5d
7
- data.tar.gz: 07edabc567099d9ecd3bcaf3b0461912eb740f340525dd57d5062edb398db6b285f95f12da83ed762514cb4ebf58cbb8ac13a643d631b8a346e73b46fd182a4a
6
+ metadata.gz: 504a133a3cefedeb24fc9c165e00c9f59f6dfee14b78fa0206b9eadb2d060e236b8e99fe211d94d8df1de9d9c93efc8d3149c931827c5df83bfa56524539ce55
7
+ data.tar.gz: 70a9e1c156fe996980610755971eb4feb899afd2a57e20cbd95c664618326385b88ef8c733268922977a7252b2fff47771ba69892f37eecc94227545c4e956b3
data/HISTORY.md CHANGED
@@ -1,3 +1,11 @@
1
+ 1.3.0 / 2018-09-18
2
+ ==================
3
+ * Updated and improved normalizers and CSL format.
4
+ * Improved Chinese reference tokenization.
5
+ * Added option to customize pdftotext path.
6
+ * Improved Finder reference line joining.
7
+ * Improved Finder model; training sets.
8
+ * Improved Parser model; training sets.
1
9
  1.2.1 / 2018-08-17
2
10
  ==================
3
11
  * Added check and train commands to CLI.
data/lib/anystyle.rb CHANGED
@@ -28,6 +28,7 @@ require 'anystyle/feature/terminal'
28
28
  require 'anystyle/feature/words'
29
29
 
30
30
  require 'anystyle/normalizer'
31
+ require 'anystyle/normalizer/arxiv'
31
32
  require 'anystyle/normalizer/brackets'
32
33
  require 'anystyle/normalizer/container'
33
34
  require 'anystyle/normalizer/date'
@@ -27,8 +27,8 @@ module AnyStyle
27
27
 
28
28
  case format.downcase
29
29
  when '.pdf'
30
- meta = pdf_meta path if opts[:parse_meta]
31
- info = pdf_info path if opts[:parse_info]
30
+ meta = pdf_meta path, **opts if opts[:parse_meta]
31
+ info = pdf_info path, **opts if opts[:parse_info]
32
32
  input = pdf_to_text path, **opts
33
33
  when '.ttx'
34
34
  tagged = true
@@ -9,11 +9,15 @@ module AnyStyle
9
9
  case alpha
10
10
  when /^ed(s|itors?|ited?|iteurs?)?$/i,
11
11
  /^(hg|hrsg|herausgeber)$/i,
12
- /^(compilador)$/i
12
+ /^(compilador)$/i,
13
+ /編/
13
14
  :editor
15
+ when /著|撰/,
16
+ :author
14
17
  when /^trans(l(ated|ators?|ation))?$/i,
15
18
  /^übers(etz(t|ung))?$/i,
16
- /^trad(uction|ucteurs?|uit)?$/i
19
+ /^trad(uction|ucteurs?|uit)?$/i,
20
+ /譯/
17
21
  :translator
18
22
  when /^(dissertation|thesis)$/i
19
23
  :thesis
@@ -21,7 +25,7 @@ module AnyStyle
21
25
  :proceedings
22
26
  when /^(Journal|Zeitschrift|Quarterly|Magazine?|Times|Rev(iew|vue)?|Bulletin|News|Week|Gazett[ea])/
23
27
  :journal
24
- when /^in$/i
28
+ when /^in$/i, /收入/
25
29
  :in
26
30
  when /^([AaUu]nd|y|e)$/
27
31
  :and
@@ -29,21 +33,33 @@ module AnyStyle
29
33
  :etal
30
34
  when /^(pp?|pages?|S(eiten?)?|ff?)$/
31
35
  :page
32
- when /^(vol(ume)?s?|iss(ue)?|n[or]?|number|fasc(icle|icule)?)$/i
36
+ when /^(vol(ume)?s?|iss(ue)?|n[or]?|number|fasc(icle|icule)?|suppl(ement)?)$/i
33
37
  :volume
34
38
  when /^(ser(ies?)?|reihe|[ck]oll(e[ck]tion))$/i
35
39
  :series
40
+ when /^patent$/i
41
+ :patent
42
+ when /^report$/i
43
+ :report
36
44
  when /^(edn|edition|expanded|rev(ised)?|p?reprint(ed)?|illustrated)$/i,
45
+ /^editio|aucta$/i
37
46
  /^(aufl(age)?|\p{Alpha}*ausg(abe)?)$/i
38
47
  :edition
39
48
  when /^(nd|date|spring|s[uo]mmer|autumn|fall|winter|frühling|herbst)$/i,
40
49
  /^(jan(uary?)?|feb(ruary?)?|mar(ch|z)?|apr(il)?|ma[yi]|jun[ei]?)$/,
41
- /^(jul[yi]?|aug(ust)?|sep(tember)?|o[ck]t(ober)?|nov(ember)?|de[cz](ember)?)$/i
50
+ /^(jul[yi]?|aug(ust)?|sep(tember)?|o[ck]t(ober)?|nov(ember)?|de[cz](ember)?)$/i,
51
+ /年/
42
52
  :date
43
- when /^(pmid|pmcid|arxiv|doi|url)/i
53
+ when /^(doi|url)/i
44
54
  :locator
45
- when /^(retrieved|accessed)$/i
55
+ when /^(pmid|pmcid)/i
56
+ :pubmed
57
+ when /^(arxiv)/i
58
+ :arxiv
59
+ when /^(retrieved|retirado|accessed)$/i
46
60
  :accessed
61
+ when /^[ILXVMCD]{2,}$/
62
+ :roman
47
63
  else
48
64
  :none
49
65
  end
@@ -32,7 +32,7 @@ module AnyStyle
32
32
  :list
33
33
  when /^(\p{Lu}\.?)\s*(\d+\.)+\s+\p{L}+/
34
34
  :title
35
- when /^(\w+\s)?(tab(le|elle|\.)|fig(ure|\.))/i
35
+ when /^(\w+\s)?(tab(le|elle|\.)|fig(ure|\.)|equation|graph|abb(ildung)?)/i
36
36
  :cap
37
37
  when /^\p{Pd}?\d+\p{Pd}?$/, /^[ivx]+$/i
38
38
  :num
@@ -3,13 +3,13 @@ module AnyStyle
3
3
  class Quotes < Feature
4
4
  def observe(token, **opts)
5
5
  case token
6
- when /^[^"'”„’‚´«「『‘“`»」』]+$/
6
+ when /^[^"'”„’‚´«「『〈《‘“`»」』〉》]+$/
7
7
  :none
8
- when /^["'”„’‚´«「『‘“`»].*["'”„’‚´«‘“`»」』][,;:\p{Pd}!\?\.]?$/
8
+ when /^["'”„’‚´«「『〈《‘“`»].*["'”„’‚´«‘“`»」』〉》][,;:\p{Pd}!\?\.]?$/
9
9
  :'quote-unquote'
10
- when /^["'”„’‚´«「『‘“`»]/
10
+ when /^["'”„’‚´«「『‘〈《“`»]/
11
11
  :quote
12
- when /["'”„’‚´«‘“`»」』][,;:\p{Pd}!\?\.]?$/
12
+ when /["'”„’‚´«‘“`»」』〉》][,;:\p{Pd}!\?\.]?$/
13
13
  :unquote
14
14
  else
15
15
  :other
@@ -14,11 +14,15 @@ module AnyStyle
14
14
  contents
15
15
  figures
16
16
  introduction
17
+ kurzfassung
17
18
  literatur
18
19
  literature
19
20
  references
20
21
  referenzen
22
+ secondary
21
23
  section
24
+ sources
25
+ summary
22
26
  tables
23
27
  works
24
28
  }
@@ -8,7 +8,10 @@ module AnyStyle
8
8
  compact: true,
9
9
  threads: 4,
10
10
  format: :references,
11
- training_data: Dir[File.join(RES, 'finder', '*.ttx')].map(&:untaint)
11
+ training_data: Dir[File.join(RES, 'finder', '*.ttx')].map(&:untaint),
12
+ layout: true,
13
+ pdftotext: 'pdftotext',
14
+ pdfinfo: 'pdfinfo'
12
15
  }
13
16
 
14
17
  def initialize(options = {})
@@ -71,12 +74,18 @@ module AnyStyle
71
74
  })
72
75
  end
73
76
 
74
- def prepare(input, layout: true, crop: false, **opts)
77
+ def prepare(input,
78
+ layout: options[:layout],
79
+ crop: false,
80
+ pdftotext: options[:pdftotext],
81
+ pdfinfo: options[:pdfinfo],
82
+ **opts)
83
+ doc_opts = { layout: layout, crop: crop, pdftotext: pdftotext, pdfinfo: pdfinfo, **opts }
75
84
  case input
76
85
  when String
77
- super(Document.open(input, layout: layout, crop: crop, **opts), **opts)
86
+ super(Document.open(input, **doc_opts), **opts)
78
87
  when Array
79
- super(Wapiti::Dataset.new(input.map { |f| Document.open(f, **opts) }), **opts)
88
+ super(Wapiti::Dataset.new(input.map { |f| Document.open(f, **doc_opts) }), **opts)
80
89
  else
81
90
  super(input, **opts)
82
91
  end
@@ -3,6 +3,7 @@ module AnyStyle
3
3
  module CSL
4
4
  def format_csl(dataset, **opts)
5
5
  format_hash(dataset).map do |hash|
6
+ dates_to_citeproc(hash, **opts) if hash.key?(:date)
6
7
  flatten_values hash, skip: Normalizer::Names.keys
7
8
 
8
9
  rename_value hash, :pages, :page
@@ -23,6 +24,22 @@ module AnyStyle
23
24
  end
24
25
 
25
26
  alias_method :format_citeproc, :format_csl
27
+
28
+ def dates_to_citeproc(hash, date_format: 'edtf', **opts)
29
+ date, = *hash.delete(:date)
30
+
31
+ case date_format.to_s
32
+ when 'citeproc'
33
+ hash[:issued] = begin
34
+ require 'citeproc'
35
+ ::CiteProc::Date.parse!(date.tr('X~', 'u?')).to_citeproc.symbolize_keys
36
+ rescue
37
+ date
38
+ end
39
+ else
40
+ hash[:issued] = date
41
+ end
42
+ end
26
43
  end
27
44
  end
28
45
  end
@@ -0,0 +1,15 @@
1
+ module AnyStyle
2
+ class Normalizer
3
+ class ArXiv < Normalizer
4
+ @keys = [:note]
5
+
6
+ def normalize(item, **opts)
7
+ each_value(item) do |_, value|
8
+ if (value =~ /arxiv:?\s*(\d{4}\.\d+(?:v\d+)?|\w+(?:.\w+)?\/\d+)/i)
9
+ append item, :arxiv, $1
10
+ end
11
+ end
12
+ end
13
+ end
14
+ end
15
+ end
@@ -5,7 +5,11 @@ module AnyStyle
5
5
 
6
6
  def normalize(item, **opts)
7
7
  map_values(item) do |_, value|
8
- value.sub(/^[Ii]n(?::|\s+the)?\s+(\p{^Ll})/, '\1')
8
+ value
9
+ .sub(/^[Ii]n(?::|\s+the)?\s+(\p{^Ll})/, '\1')
10
+ .sub(/^of\s+/, '')
11
+ .sub(/^收入/, '')
12
+ .sub(/^(\w+ )?presented at (the )?/i, '')
9
13
  end
10
14
  end
11
15
  end
@@ -3,7 +3,7 @@ module AnyStyle
3
3
 
4
4
  class Normalizer
5
5
  class Locator < Normalizer
6
- @keys = [:isbn, :url]
6
+ @keys = [:isbn, :url, :doi]
7
7
 
8
8
  def normalize(item, **opts)
9
9
  map_values(item) do |key, value|
@@ -11,12 +11,20 @@ module AnyStyle
11
11
  when :isbn
12
12
  value[/[\d-]+/]
13
13
  when :url
14
+ doi = doi_extract(value)
15
+ append item, :doi, doi unless doi.nil?
14
16
  URI.extract(value)
17
+ when :doi
18
+ doi_extract(value) || value
15
19
  else
16
20
  value
17
21
  end
18
22
  end
19
23
  end
20
24
  end
25
+
26
+ def doi_extract(value)
27
+ value[/10\.(\d{4,9}\/[-._;()\/:A-Z0-9]+|1002\/\S+)/i]
28
+ end
21
29
  end
22
30
  end
@@ -30,7 +30,7 @@ module AnyStyle
30
30
  begin
31
31
  parse(strip(value))
32
32
  rescue
33
- [{ literal: value }]
33
+ [{ literal: value.strip }]
34
34
  end
35
35
  end
36
36
  end
@@ -14,8 +14,10 @@ module AnyStyle
14
14
 
15
15
  def normalize(item, **opts)
16
16
  each_value(item) do |_, value|
17
- value.gsub!(/[\)\]\.,:;\p{Pd}\p{Z}\p{C}]+$/, '')
18
- value.gsub!(/^[\(\[]/, '')
17
+ value.gsub!(/\s*[\)\]\.,:;\p{Pd}\p{Z}\p{C}。、》〉]+$/, '')
18
+ value.gsub!(/[,:;》〉]+$/, '')
19
+ value.gsub!(/^[\(\[《〈]/, '')
20
+ value.gsub!(/<\/?(italic|bold)>/, '')
19
21
  end
20
22
  end
21
23
  end
@@ -100,7 +100,7 @@ module AnyStyle
100
100
  compact: true,
101
101
  threads: 4,
102
102
  separator: /(?:\r?\n)+/,
103
- delimiter: /\s+/,
103
+ delimiter: /\s+|([\uFF01-\uFF64]|。|、)/,
104
104
  format: :hash,
105
105
  training_data: File.join(RES, 'parser', 'core.xml')
106
106
  }
@@ -139,6 +139,7 @@ module AnyStyle
139
139
  Normalizer::Locator.new,
140
140
  Normalizer::Publisher.new,
141
141
  Normalizer::PubMed.new,
142
+ Normalizer::ArXiv.new,
142
143
  Normalizer::Names.new,
143
144
  Normalizer::Locale.new,
144
145
  Normalizer::Type.new
data/lib/anystyle/refs.rb CHANGED
@@ -90,7 +90,7 @@ module AnyStyle
90
90
  indent_score(indent),
91
91
  delta_score(delta),
92
92
  years_score(a, b),
93
- terminal_score(a),
93
+ terminal_score(a, b),
94
94
  initial_score(a, b),
95
95
  length_score(a, b),
96
96
  pages_score(a, b)
@@ -100,7 +100,7 @@ module AnyStyle
100
100
 
101
101
  def indent_score(indent)
102
102
  case
103
- when indent > 0 then 1
103
+ when indent > 0 then 1.25
104
104
  when indent < 0 then -1
105
105
  else
106
106
  0
@@ -119,10 +119,19 @@ module AnyStyle
119
119
 
120
120
  def years_score(a, b)
121
121
  if match_year?(a)
122
- if b.length > 35 && match_year?(b)
123
- -1
122
+ if match_year?(b)
123
+ case
124
+ when b.length < 18
125
+ 1
126
+ when b.length < 25
127
+ 0.5
128
+ when b.length > 60
129
+ -0.75
130
+ else
131
+ 0
132
+ end
124
133
  else
125
- if a.match(/[\d,] (1[4-9]|2[01])\d\d[a-z]?\.$/)
134
+ if a.match(/[\d,] \(?(1[4-9]|2[01])\d\d[a-z]?\)?\.$/)
126
135
  -0.5
127
136
  else
128
137
  1
@@ -138,10 +147,10 @@ module AnyStyle
138
147
  end
139
148
 
140
149
  def pages_score(a, b)
141
- if match_pages?(a)
150
+ if match_pages?(a, true)
142
151
  -0.25
143
152
  else
144
- if match_pages?(b)
153
+ if match_pages?(b, false)
145
154
  1
146
155
  else
147
156
  0
@@ -149,23 +158,25 @@ module AnyStyle
149
158
  end
150
159
  end
151
160
 
152
- def match_pages?(string)
161
+ def match_pages?(string, not_years = true)
153
162
  m = string.match(/(\d+)\p{Pd}(\d+)|\bpp?\.|\d+\(\d+\)/)
154
163
  return false if m.nil?
155
- return false if m[1] && match_year?(m[1]) && match_year?(m[2])
164
+ return false if not_years && m[1] && match_year?(m[1]) && match_year?(m[2])
156
165
  return true
157
166
  end
158
167
 
159
- def terminal_score(string)
160
- case string
161
- when /https?:\/\/\w+/i
162
- -1
163
- when /[,;:&\p{Pd}]$/, /(et al|pp)\.$/
168
+ def terminal_score(a, b)
169
+ case
170
+ when a.match(/https?:\/\/\w+/i)
171
+ -0.25
172
+ when a.match(/[,;:&\p{Pd}]$/), a.match(/\s(et al|pp|pg)\.$/)
164
173
  2
165
- when /\((1[4-9]|2[01])\d\d\)\.?$/
174
+ when a.match(/\((1[4-9]|2[01])\d\d\)\.?$/)
166
175
  0
167
- when /(\p{^Lu}\.|\])$/
176
+ when a.match(/(\p{^Lu}\.|\])$/)
168
177
  -1
178
+ when a.match(/\d$/) && b.match(/^\p{Lu}/)
179
+ -0.25
169
180
  else
170
181
  0
171
182
  end
@@ -177,13 +188,15 @@ module AnyStyle
177
188
  1.5
178
189
  when a.match(/\p{L}$/) && b.match(/^\p{L}/)
179
190
  1
180
- when b.match(/^["'”„’‚´«「『‘“`»]/), b.match(/^url|http/i)
191
+ when b.match(/^["'”„’‚´«「『‘“`»]/)
181
192
  1
193
+ when b.match(/^(url|doi|isbn|vol)\b/i)
194
+ 1.5
182
195
  when b.match(/^([\p{Pd}_*][\p{Pd}_* ]+|\p{Co})/)
183
196
  -1.5
184
197
  when b.match(/^\((1[4-9]|2[01])\d\d\)/) && !a.match(/(\p{Lu}|al|others)\.$/)
185
198
  -1
186
- when b.match(/^\p{Lu}\p{Ll}+,\s\p{L}/) && !a.match(/\p{L}$/)
199
+ when b.match(/^\p{Lu}[\p{Ll}-]+,?\s\p{Lu}/) && !a.match(/\p{L}$/)
187
200
  -0.5
188
201
  when match_list?(b)
189
202
  if match_list?(a)
@@ -191,13 +204,15 @@ module AnyStyle
191
204
  else
192
205
  -0.75
193
206
  end
207
+ when b.match(/^\p{L}+:/), b.match(/^\p{L}+ \d/)
208
+ 0.5
194
209
  else
195
210
  0
196
211
  end
197
212
  end
198
213
 
199
214
  def match_list?(string)
200
- string.match(/^(\d{1,3}\.\s+\p{L}|\[\p{Alnum}+\])/)
215
+ string.match(/^(\d{1,3}(\.\s+|\s{2,})\p{L}|\[\p{Alnum}+\])/)
201
216
  end
202
217
 
203
218
  def length_score(a, b)
@@ -205,8 +220,14 @@ module AnyStyle
205
220
  when b.length < a.length
206
221
  case
207
222
  when b.length < 10
223
+ 2.5
224
+ when b.length < 15
208
225
  2
226
+ when b.length < 20
227
+ 1.75
209
228
  when b.length < 25
229
+ 1.5
230
+ when b.length < 30
210
231
  1
211
232
  when b.length < 50
212
233
  0.75
@@ -221,7 +242,7 @@ module AnyStyle
221
242
  end
222
243
 
223
244
  def join(a, b)
224
- if a.end_with? '-'
245
+ if a =~ /\p{Pd}$/
225
246
  if a =~ /\p{Ll}-$/ && b =~ /^\p{Ll}/
226
247
  "#{a[0...-1]}#{b}"
227
248
  else