anystyle 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA256:
3
- metadata.gz: 411233e757de4873a040de0adac8a862e9a8d77fe8844128ea5279f774608f45
4
- data.tar.gz: '09133619934b406c21a75447d19c7b81d672bcd61e19d6f2bb0064e6b8bda718'
2
+ SHA1:
3
+ metadata.gz: 3b74980250ee2a9627b97a9f91e6c11052ddd1e0
4
+ data.tar.gz: 645f6371761078aa5d4b49c0905a749eda55dd3b
5
5
  SHA512:
6
- metadata.gz: 164b640e6b8c79a4d19ff82da141acf6d75cf133cff56624440ca63a5803a4ba457d65090bde7cb1074b503390e99430a7aaf911bd647b0484a9d982ea6fbdb2
7
- data.tar.gz: ae83ce33900704257bc7ac2fc21d278125098821bcf6577bec868779a009e96743f3623f41078255f4a6b56c6b826e5899159dc2564af0f41c65035ae238af1b
6
+ metadata.gz: 449c2ffca67851254e4e9a98fa5618a57928147e3fda613b908ab83bc8d4d096cec7f11bae9f7727fbcad69dfac73d6f4f1034f00cece3c30a7813bfc7c2c20e
7
+ data.tar.gz: c14d4637d11689eb88727baabcdc30c672642d93dad810c15b86e2197835c75360c21ccf03bbdf9da3e909e0f0777d55f9ef81218700231db8ea5504051f8699
data/HISTORY.md CHANGED
@@ -1,3 +1,12 @@
1
+ 1.2.0 / 2018-08-16
2
+ ==================
3
+ * Added check and train commands to CLI.
4
+ * Added --no-solo and --crop flags to find command.
5
+ * Added reference block normalizer.
6
+ * Added script detection normalizer.
7
+ * Improved Finder reference line joining.
8
+ * Improved Finder model; training sets.
9
+
1
10
  1.1.0 / 2018-07-11
2
11
  ==================
3
12
  * Improved Parser model; training sets.
@@ -49,6 +49,8 @@ require 'anystyle/normalizer/volume'
49
49
  require 'anystyle/format/bibtex'
50
50
  require 'anystyle/format/csl'
51
51
 
52
+ require 'anystyle/page'
53
+ require 'anystyle/refs'
52
54
  require 'anystyle/document'
53
55
  require 'anystyle/parser'
54
56
  require 'anystyle/finder'
@@ -1,7 +1,10 @@
1
1
  module AnyStyle
2
2
  class Document < Wapiti::Sequence
3
+
4
+ REFSECT = /references|referenzen|cited|bibliogra|secondary sources|literatur/i
5
+
3
6
  class << self
4
- include PdfUtils
7
+ include PDFUtils
5
8
 
6
9
  def parse(string, delimiter: /\r?\n/, tagged: false)
7
10
  current_label = ''
@@ -14,7 +17,7 @@ module AnyStyle
14
17
  })
15
18
  end
16
19
 
17
- def open(path, format: File.extname(path), tagged: false, layout: true, **opts)
20
+ def open(path, format: File.extname(path), tagged: false, **opts)
18
21
  raise ArgumentError,
19
22
  "cannot open tainted path: '#{path}'" if path.tainted?
20
23
  raise ArgumentError,
@@ -26,7 +29,7 @@ module AnyStyle
26
29
  when '.pdf'
27
30
  meta = pdf_meta path if opts[:parse_meta]
28
31
  info = pdf_info path if opts[:parse_info]
29
- input = pdf_to_text path, layout: layout
32
+ input = pdf_to_text path, **opts
30
33
  when '.ttx'
31
34
  tagged = true
32
35
  input = File.read(path, encoding: 'utf-8')
@@ -47,8 +50,16 @@ module AnyStyle
47
50
  attr_accessor :meta, :info, :path, :pages, :tokens
48
51
  alias_method :lines, :tokens
49
52
 
53
+ def line_counts
54
+ @line_counts ||= Hash.new(0)
55
+ end
56
+
57
+ def nnum_counts
58
+ @nnum_counts ||= Hash.new(0)
59
+ end
60
+
50
61
  def pages
51
- @pages ||= Page.parse(lines)
62
+ @pages ||= Page.parse(lines, self)
52
63
  end
53
64
 
54
65
  def each
@@ -64,28 +75,34 @@ module AnyStyle
64
75
  end
65
76
  end
66
77
 
67
- def each_section
78
+ def each_section(skip: ['meta'])
68
79
  if block_given?
69
- current = []
80
+ head = []
81
+ body = []
82
+ seen_content = false
83
+
70
84
  lines.each do |ln|
71
85
  case ln.label
72
86
  when 'title'
73
- unless current.empty?
74
- yield current
75
- current = []
87
+ if seen_content
88
+ yield [head, body]
89
+ head, body, seen_content = [ln], [], false
90
+ else
91
+ head << ln
76
92
  end
77
93
  when 'ref', 'text'
78
- current << ln
94
+ body << ln
95
+ seen_content = true
79
96
  else
80
- # ignore
97
+ body << ln unless skip.include?(ln.label)
81
98
  end
82
99
  end
83
- unless current.empty?
84
- yield current
100
+ unless head.empty?
101
+ yield [head, body]
85
102
  end
86
103
  self
87
104
  else
88
- to_enum
105
+ to_enum :each_section
89
106
  end
90
107
  end
91
108
 
@@ -94,7 +111,8 @@ module AnyStyle
94
111
  doc.tokens = lines.map.with_index { |line, idx|
95
112
  Wapiti::Token.new line.value,
96
113
  label: other[idx].label.to_s,
97
- observations: other[idx].observations.dup
114
+ observations: other[idx].observations.dup,
115
+ score: other[idx].score
98
116
  }
99
117
  doc
100
118
  end
@@ -126,83 +144,45 @@ module AnyStyle
126
144
  }
127
145
  end
128
146
 
129
- def references(**opts)
130
- bib, current, delta, indent = [], nil, 0, 0
131
-
132
- lines.each do |ln|
133
- case ln.label
134
- when 'ref'
135
- val = display_chars(ln.value).rstrip
136
- idt = val[/^\s*/].length
137
- val.lstrip!
138
-
139
- if current.nil?
140
- current, delta, indent = val, 0, idt
141
- else
142
- if join_refs?(current, val, delta, idt - indent)
143
- current = join_refs(current, val)
144
- else
145
- bib << current
146
- current, delta, indent = val, 0, idt
147
- end
148
- end
149
- else
150
- unless current.nil?
151
- if delta > 15 || %w{ blank meta }.include?(ln.label)
152
- delta += 1
153
- else
154
- bib << current
155
- current, delta, indent = nil, 0, idt
147
+ def references(normalize_blocks: false, **opts)
148
+ if normalize_blocks
149
+ each_section.inject([]) do |refs, (head, body)|
150
+ rc = body.count { |tk| tk.label == 'ref' }
151
+ unless rc == 0
152
+ tc = body.count { |tk| tk.label == 'text' }
153
+ is_ref_sect = !head.find { |tk| tk.value =~ REFSECT }.nil?
154
+
155
+ # Skip sections with few ref lines!
156
+ if is_ref_sect || include_references?(rc, tc)
157
+ Refs.normalize! body, max_win_size: is_ref_sect ? 6 : 2
158
+ refs.concat Refs.parse(body).to_a
156
159
  end
157
160
  end
158
- end
159
- end
160
161
 
161
- unless current.nil?
162
- bib << current
162
+ refs
163
+ end
164
+ else
165
+ Refs.parse(lines).to_a
163
166
  end
164
-
165
- bib
166
167
  end
167
168
 
168
- def join_refs?(a, b, delta = 0, indent = 0)
169
- pro = [
170
- indent > 0,
171
- delta == 0,
172
- b.length < 50,
173
- a.length < 65,
174
- !!a.match(/[,;:&\p{Pd}]$/),
175
- !!b.match(/^\p{Ll}/) || !!a.match(/\p{L}$/) && !!b.match(/^\p{L}/)
176
- ].count(true)
177
-
178
- con = [
179
- indent < 0,
180
- delta > 8,
181
- !!a.match(/\.\]$/),
182
- a.length > 500,
183
- (b.length - a.length) > 12,
184
- !!b.match(/^(\p{Pd}\p{Pd}|\p{Lu}\p{Ll}+, \p{Lu}\.|\[\d)/)
185
- ].count(true)
186
-
187
- (pro - con) > 1
169
+ def include_references?(rc, tc)
170
+ rc > 10 || (rc + tc) > 20 && (rc.to_f / tc) > 0.2
188
171
  end
189
172
 
190
- def join_refs(a, b)
191
- if a[-1] == '-'
192
- if b =~ /^\p{Ll}/
193
- "#{a[0...-1]}#{b}"
194
- else
195
- "#{a}#{b}"
196
- end
197
- else
198
- "#{a} #{b}"
173
+ def sections(delimiter: "\n", spacer: ' ', **opts)
174
+ each_section.map do |(head, body)|
175
+ {
176
+ title: head.map { |tk|
177
+ display_chars(tk.value).lstrip.unicode_normalize
178
+ }.join(spacer),
179
+ text: body.map { |tk|
180
+ display_chars(tk.value).unicode_normalize
181
+ }.join(delimiter)
182
+ }
199
183
  end
200
184
  end
201
185
 
202
- def sections(delimiter: "\n", **opts)
203
- []
204
- end
205
-
206
186
  def title(delimiter: " ", **opts)
207
187
  lines.drop_while { |ln|
208
188
  ln.label != 'title'
@@ -214,51 +194,5 @@ module AnyStyle
214
194
  def inspect
215
195
  "#<AnyStyle::Document lines={#{size}}>"
216
196
  end
217
-
218
-
219
- class Page
220
- extend StringUtils
221
-
222
- class << self
223
- def parse(lines)
224
- pages, current, width = [], [], 0
225
-
226
- lines.each do |line|
227
- if page_break?(line.value)
228
- unless current.empty?
229
- pages << new(current, width: width)
230
- end
231
-
232
- current = [line]
233
- width = display_width(line.value)
234
- else
235
- current << line
236
- width = [width, display_width(line.value)].max
237
- end
238
- end
239
-
240
- unless current.empty?
241
- pages << new(current, width: width)
242
- end
243
-
244
- pages
245
- end
246
- end
247
-
248
- attr_accessor :lines, :width
249
-
250
- def initialize(lines = [], width: 0)
251
- @lines = lines
252
- @width = width
253
- end
254
-
255
- def size
256
- lines.size
257
- end
258
-
259
- def inspect
260
- "#<AnyStyle::Document::Page size={#{size}} width={#{width}}>"
261
- end
262
- end
263
197
  end
264
198
  end
@@ -1,8 +1,8 @@
1
1
  module AnyStyle
2
2
  class Feature
3
3
  class Line < Feature
4
- def observe(token, page:, **opts)
5
- chars = display_chars(token).rstrip
4
+ def observe(token, page:, seq:, **opts)
5
+ chars = display_chars(token)
6
6
 
7
7
  lttrs = count(chars, /\p{L}/)
8
8
  upper = count(chars, /\p{Lu}/)
@@ -18,22 +18,37 @@ module AnyStyle
18
18
  ratio(white, chars.length),
19
19
  ratio(punct, chars.length),
20
20
  ratio(width, page.width),
21
- classify(chars)
21
+ classify(chars),
22
+ page_ratio(seq.line_counts[chars], seq.pages.length),
23
+ page_ratio(seq.nnum_counts[nnum(chars)], seq.pages.length)
22
24
  ]
23
25
  end
24
26
 
25
27
  def classify(chars)
26
- case chars
27
- when /\.\s*\.\s*\.\s*\.|……+/
28
+ case chars.lstrip
29
+ when /\.\s*\.\s*\.\s*\.|……+/, /\p{L}\s{5,}\d+$/
28
30
  :toc
29
- when /\s\s\s\d+$/
30
- :num
31
- when /^\s*(Table|Fig(ure|\.))/
31
+ when /^[\[\(]?\d+\.?[\]\)]?\s+\p{L}+/
32
+ :list
33
+ when /^(\p{Lu}\.?)\s*(\d+\.)+\s+\p{L}+/
34
+ :title
35
+ when /^(\w+\s)?(tab(le|elle|\.)|fig(ure|\.))/i
32
36
  :cap
37
+ when /^\p{Pd}?\d+\p{Pd}?$/, /^[ivx]+$/i
38
+ :num
39
+ when /copyright|©|rights reserved/i
40
+ :copyright
41
+ when /https?:\/\//i
42
+ :http
33
43
  else
34
44
  :none
35
45
  end
36
46
  end
47
+
48
+ def page_ratio(a, b)
49
+ r = a.to_f / b
50
+ r == 1 ? '=' : r > 1 ? '+' : (r * 10).round
51
+ end
37
52
  end
38
53
  end
39
54
  end
@@ -3,8 +3,8 @@ module AnyStyle
3
3
  class Ref < Feature
4
4
  def observe(token, **opts)
5
5
  [
6
- symbolize(count(token, /\b(1\d|20)\d\d\b/)),
7
- symbolize(count(token, /(\d[\(:;]\d)|(\d\s*\p{Pd}+\s*\d)|\bpp?\.|\bvols?\.|\bnos?\./i)),
6
+ symbolize(count(token, /\b(1[4-9]|20)\d\d\b/)),
7
+ symbolize(count(token, /(\d[\(:;]\d)|(\d\s*\p{Pd}+\s*\d)|\bpp?\.|\bvols?\.|\b(nos?|nr|iss?|fasc)\.|n°|nº/i)),
8
8
  symbolize(count(token, /\b\p{Lu}\./)),
9
9
  symbolize(count(token, /\b(eds?\.|edited by|editors?|hg|hrsg|et al)\b/i)),
10
10
  token =~ /^\s*(\[\w+\]|\(\d+\)|\d+\.)\s+/ ? 'T' : 'F'
@@ -13,8 +13,8 @@ module AnyStyle
13
13
 
14
14
  def symbolize(k)
15
15
  return '-' if k < 1
16
- return '+' if k < 3
17
- return '++'
16
+ return '+' if k < 2
17
+ return '*'
18
18
  end
19
19
  end
20
20
  end
@@ -63,18 +63,18 @@ module AnyStyle
63
63
  dataset.map { |doc| doc.references(**opts) }
64
64
  end
65
65
 
66
- def label(input, layout: true, **opts)
67
- dataset = prepare(input, layout: layout, **opts)
66
+ def label(input, layout: true, crop: false, **opts)
67
+ dataset = prepare(input, layout: layout, crop: crop, **opts)
68
68
  output = model.label(dataset, **opts)
69
69
  Wapiti::Dataset.new(dataset.map.with_index { |doc, idx|
70
70
  doc.label(output[idx])
71
71
  })
72
72
  end
73
73
 
74
- def prepare(input, layout: true, **opts)
74
+ def prepare(input, layout: true, crop: false, **opts)
75
75
  case input
76
76
  when String
77
- super(Document.open(input, layout: layout, **opts), **opts)
77
+ super(Document.open(input, layout: layout, crop: false, **opts), **opts)
78
78
  when Array
79
79
  super(Wapiti::Dataset.new(input.map { |f| Document.open(f, **opts) }), **opts)
80
80
  else
@@ -1,5 +1,6 @@
1
1
  module AnyStyle
2
2
  maybe_require 'language_detector'
3
+ maybe_require 'unicode/scripts'
3
4
 
4
5
  class Normalizer
5
6
  class Locale < Normalizer
@@ -8,23 +9,33 @@ module AnyStyle
8
9
  end
9
10
 
10
11
  def normalize(item, **opts)
11
- return item if @ld.nil? || item.key?(:language)
12
-
13
12
  sample = item.values_at(
14
13
  :title,
15
14
  :'container-title',
16
- # :'collection-title',
15
+ :'collection-title',
17
16
  :location,
18
17
  :journal,
19
- :publisher
20
- # :note
18
+ :publisher,
19
+ :note
21
20
  ).flatten.compact.join(' ')
22
21
 
23
22
  return item if sample.empty?
24
23
 
25
- item[:language] = @ld.detect(sample)
24
+ language = detect_language(sample)
25
+ scripts = detect_scripts(sample)
26
+
27
+ item[:language] ||= language unless language.nil?
28
+ item[:scripts] ||= scripts unless scripts.nil?
26
29
  item
27
30
  end
28
31
  end
32
+
33
+ def detect_language(string)
34
+ @ld.detect(string) unless @ld.nil?
35
+ end
36
+
37
+ def detect_scripts(string)
38
+ ::Unicode::Scripts.scripts(string) if defined?(::Unicode::Scripts)
39
+ end
29
40
  end
30
41
  end
@@ -37,7 +37,7 @@ module AnyStyle
37
37
  end
38
38
 
39
39
  def repeater?(value)
40
- value =~ /^[\p{Pd}_*][\p{Pd}_* ]+(,|:|\.|$)/
40
+ value =~ /^([\p{Pd}_*][\p{Pd}_* ]+|\p{Co})(,|:|\.|$)/
41
41
  end
42
42
 
43
43
  def strip(value)