anystyle 1.1.0 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA256:
3
- metadata.gz: 411233e757de4873a040de0adac8a862e9a8d77fe8844128ea5279f774608f45
4
- data.tar.gz: '09133619934b406c21a75447d19c7b81d672bcd61e19d6f2bb0064e6b8bda718'
2
+ SHA1:
3
+ metadata.gz: 3b74980250ee2a9627b97a9f91e6c11052ddd1e0
4
+ data.tar.gz: 645f6371761078aa5d4b49c0905a749eda55dd3b
5
5
  SHA512:
6
- metadata.gz: 164b640e6b8c79a4d19ff82da141acf6d75cf133cff56624440ca63a5803a4ba457d65090bde7cb1074b503390e99430a7aaf911bd647b0484a9d982ea6fbdb2
7
- data.tar.gz: ae83ce33900704257bc7ac2fc21d278125098821bcf6577bec868779a009e96743f3623f41078255f4a6b56c6b826e5899159dc2564af0f41c65035ae238af1b
6
+ metadata.gz: 449c2ffca67851254e4e9a98fa5618a57928147e3fda613b908ab83bc8d4d096cec7f11bae9f7727fbcad69dfac73d6f4f1034f00cece3c30a7813bfc7c2c20e
7
+ data.tar.gz: c14d4637d11689eb88727baabcdc30c672642d93dad810c15b86e2197835c75360c21ccf03bbdf9da3e909e0f0777d55f9ef81218700231db8ea5504051f8699
data/HISTORY.md CHANGED
@@ -1,3 +1,12 @@
1
+ 1.2.0 / 2018-08-16
2
+ ==================
3
+ * Added check and train commands to CLI.
4
+ * Added --no-solo and --crop flags to find command.
5
+ * Added reference block normalizer.
6
+ * Added script detection normalizer.
7
+ * Improved Finder reference line joining.
8
+ * Improved Finder model; training sets.
9
+
1
10
  1.1.0 / 2018-07-11
2
11
  ==================
3
12
  * Improved Parser model; training sets.
@@ -49,6 +49,8 @@ require 'anystyle/normalizer/volume'
49
49
  require 'anystyle/format/bibtex'
50
50
  require 'anystyle/format/csl'
51
51
 
52
+ require 'anystyle/page'
53
+ require 'anystyle/refs'
52
54
  require 'anystyle/document'
53
55
  require 'anystyle/parser'
54
56
  require 'anystyle/finder'
@@ -1,7 +1,10 @@
1
1
  module AnyStyle
2
2
  class Document < Wapiti::Sequence
3
+
4
+ REFSECT = /references|referenzen|cited|bibliogra|secondary sources|literatur/i
5
+
3
6
  class << self
4
- include PdfUtils
7
+ include PDFUtils
5
8
 
6
9
  def parse(string, delimiter: /\r?\n/, tagged: false)
7
10
  current_label = ''
@@ -14,7 +17,7 @@ module AnyStyle
14
17
  })
15
18
  end
16
19
 
17
- def open(path, format: File.extname(path), tagged: false, layout: true, **opts)
20
+ def open(path, format: File.extname(path), tagged: false, **opts)
18
21
  raise ArgumentError,
19
22
  "cannot open tainted path: '#{path}'" if path.tainted?
20
23
  raise ArgumentError,
@@ -26,7 +29,7 @@ module AnyStyle
26
29
  when '.pdf'
27
30
  meta = pdf_meta path if opts[:parse_meta]
28
31
  info = pdf_info path if opts[:parse_info]
29
- input = pdf_to_text path, layout: layout
32
+ input = pdf_to_text path, **opts
30
33
  when '.ttx'
31
34
  tagged = true
32
35
  input = File.read(path, encoding: 'utf-8')
@@ -47,8 +50,16 @@ module AnyStyle
47
50
  attr_accessor :meta, :info, :path, :pages, :tokens
48
51
  alias_method :lines, :tokens
49
52
 
53
+ def line_counts
54
+ @line_counts ||= Hash.new(0)
55
+ end
56
+
57
+ def nnum_counts
58
+ @nnum_counts ||= Hash.new(0)
59
+ end
60
+
50
61
  def pages
51
- @pages ||= Page.parse(lines)
62
+ @pages ||= Page.parse(lines, self)
52
63
  end
53
64
 
54
65
  def each
@@ -64,28 +75,34 @@ module AnyStyle
64
75
  end
65
76
  end
66
77
 
67
- def each_section
78
+ def each_section(skip: ['meta'])
68
79
  if block_given?
69
- current = []
80
+ head = []
81
+ body = []
82
+ seen_content = false
83
+
70
84
  lines.each do |ln|
71
85
  case ln.label
72
86
  when 'title'
73
- unless current.empty?
74
- yield current
75
- current = []
87
+ if seen_content
88
+ yield [head, body]
89
+ head, body, seen_content = [ln], [], false
90
+ else
91
+ head << ln
76
92
  end
77
93
  when 'ref', 'text'
78
- current << ln
94
+ body << ln
95
+ seen_content = true
79
96
  else
80
- # ignore
97
+ body << ln unless skip.include?(ln.label)
81
98
  end
82
99
  end
83
- unless current.empty?
84
- yield current
100
+ unless head.empty?
101
+ yield [head, body]
85
102
  end
86
103
  self
87
104
  else
88
- to_enum
105
+ to_enum :each_section
89
106
  end
90
107
  end
91
108
 
@@ -94,7 +111,8 @@ module AnyStyle
94
111
  doc.tokens = lines.map.with_index { |line, idx|
95
112
  Wapiti::Token.new line.value,
96
113
  label: other[idx].label.to_s,
97
- observations: other[idx].observations.dup
114
+ observations: other[idx].observations.dup,
115
+ score: other[idx].score
98
116
  }
99
117
  doc
100
118
  end
@@ -126,83 +144,45 @@ module AnyStyle
126
144
  }
127
145
  end
128
146
 
129
- def references(**opts)
130
- bib, current, delta, indent = [], nil, 0, 0
131
-
132
- lines.each do |ln|
133
- case ln.label
134
- when 'ref'
135
- val = display_chars(ln.value).rstrip
136
- idt = val[/^\s*/].length
137
- val.lstrip!
138
-
139
- if current.nil?
140
- current, delta, indent = val, 0, idt
141
- else
142
- if join_refs?(current, val, delta, idt - indent)
143
- current = join_refs(current, val)
144
- else
145
- bib << current
146
- current, delta, indent = val, 0, idt
147
- end
148
- end
149
- else
150
- unless current.nil?
151
- if delta > 15 || %w{ blank meta }.include?(ln.label)
152
- delta += 1
153
- else
154
- bib << current
155
- current, delta, indent = nil, 0, idt
147
+ def references(normalize_blocks: false, **opts)
148
+ if normalize_blocks
149
+ each_section.inject([]) do |refs, (head, body)|
150
+ rc = body.count { |tk| tk.label == 'ref' }
151
+ unless rc == 0
152
+ tc = body.count { |tk| tk.label == 'text' }
153
+ is_ref_sect = !head.find { |tk| tk.value =~ REFSECT }.nil?
154
+
155
+ # Skip sections with few ref lines!
156
+ if is_ref_sect || include_references?(rc, tc)
157
+ Refs.normalize! body, max_win_size: is_ref_sect ? 6 : 2
158
+ refs.concat Refs.parse(body).to_a
156
159
  end
157
160
  end
158
- end
159
- end
160
161
 
161
- unless current.nil?
162
- bib << current
162
+ refs
163
+ end
164
+ else
165
+ Refs.parse(lines).to_a
163
166
  end
164
-
165
- bib
166
167
  end
167
168
 
168
- def join_refs?(a, b, delta = 0, indent = 0)
169
- pro = [
170
- indent > 0,
171
- delta == 0,
172
- b.length < 50,
173
- a.length < 65,
174
- !!a.match(/[,;:&\p{Pd}]$/),
175
- !!b.match(/^\p{Ll}/) || !!a.match(/\p{L}$/) && !!b.match(/^\p{L}/)
176
- ].count(true)
177
-
178
- con = [
179
- indent < 0,
180
- delta > 8,
181
- !!a.match(/\.\]$/),
182
- a.length > 500,
183
- (b.length - a.length) > 12,
184
- !!b.match(/^(\p{Pd}\p{Pd}|\p{Lu}\p{Ll}+, \p{Lu}\.|\[\d)/)
185
- ].count(true)
186
-
187
- (pro - con) > 1
169
+ def include_references?(rc, tc)
170
+ rc > 10 || (rc + tc) > 20 && (rc.to_f / tc) > 0.2
188
171
  end
189
172
 
190
- def join_refs(a, b)
191
- if a[-1] == '-'
192
- if b =~ /^\p{Ll}/
193
- "#{a[0...-1]}#{b}"
194
- else
195
- "#{a}#{b}"
196
- end
197
- else
198
- "#{a} #{b}"
173
+ def sections(delimiter: "\n", spacer: ' ', **opts)
174
+ each_section.map do |(head, body)|
175
+ {
176
+ title: head.map { |tk|
177
+ display_chars(tk.value).lstrip.unicode_normalize
178
+ }.join(spacer),
179
+ text: body.map { |tk|
180
+ display_chars(tk.value).unicode_normalize
181
+ }.join(delimiter)
182
+ }
199
183
  end
200
184
  end
201
185
 
202
- def sections(delimiter: "\n", **opts)
203
- []
204
- end
205
-
206
186
  def title(delimiter: " ", **opts)
207
187
  lines.drop_while { |ln|
208
188
  ln.label != 'title'
@@ -214,51 +194,5 @@ module AnyStyle
214
194
  def inspect
215
195
  "#<AnyStyle::Document lines={#{size}}>"
216
196
  end
217
-
218
-
219
- class Page
220
- extend StringUtils
221
-
222
- class << self
223
- def parse(lines)
224
- pages, current, width = [], [], 0
225
-
226
- lines.each do |line|
227
- if page_break?(line.value)
228
- unless current.empty?
229
- pages << new(current, width: width)
230
- end
231
-
232
- current = [line]
233
- width = display_width(line.value)
234
- else
235
- current << line
236
- width = [width, display_width(line.value)].max
237
- end
238
- end
239
-
240
- unless current.empty?
241
- pages << new(current, width: width)
242
- end
243
-
244
- pages
245
- end
246
- end
247
-
248
- attr_accessor :lines, :width
249
-
250
- def initialize(lines = [], width: 0)
251
- @lines = lines
252
- @width = width
253
- end
254
-
255
- def size
256
- lines.size
257
- end
258
-
259
- def inspect
260
- "#<AnyStyle::Document::Page size={#{size}} width={#{width}}>"
261
- end
262
- end
263
197
  end
264
198
  end
@@ -1,8 +1,8 @@
1
1
  module AnyStyle
2
2
  class Feature
3
3
  class Line < Feature
4
- def observe(token, page:, **opts)
5
- chars = display_chars(token).rstrip
4
+ def observe(token, page:, seq:, **opts)
5
+ chars = display_chars(token)
6
6
 
7
7
  lttrs = count(chars, /\p{L}/)
8
8
  upper = count(chars, /\p{Lu}/)
@@ -18,22 +18,37 @@ module AnyStyle
18
18
  ratio(white, chars.length),
19
19
  ratio(punct, chars.length),
20
20
  ratio(width, page.width),
21
- classify(chars)
21
+ classify(chars),
22
+ page_ratio(seq.line_counts[chars], seq.pages.length),
23
+ page_ratio(seq.nnum_counts[nnum(chars)], seq.pages.length)
22
24
  ]
23
25
  end
24
26
 
25
27
  def classify(chars)
26
- case chars
27
- when /\.\s*\.\s*\.\s*\.|……+/
28
+ case chars.lstrip
29
+ when /\.\s*\.\s*\.\s*\.|……+/, /\p{L}\s{5,}\d+$/
28
30
  :toc
29
- when /\s\s\s\d+$/
30
- :num
31
- when /^\s*(Table|Fig(ure|\.))/
31
+ when /^[\[\(]?\d+\.?[\]\)]?\s+\p{L}+/
32
+ :list
33
+ when /^(\p{Lu}\.?)\s*(\d+\.)+\s+\p{L}+/
34
+ :title
35
+ when /^(\w+\s)?(tab(le|elle|\.)|fig(ure|\.))/i
32
36
  :cap
37
+ when /^\p{Pd}?\d+\p{Pd}?$/, /^[ivx]+$/i
38
+ :num
39
+ when /copyright|©|rights reserved/i
40
+ :copyright
41
+ when /https?:\/\//i
42
+ :http
33
43
  else
34
44
  :none
35
45
  end
36
46
  end
47
+
48
+ def page_ratio(a, b)
49
+ r = a.to_f / b
50
+ r == 1 ? '=' : r > 1 ? '+' : (r * 10).round
51
+ end
37
52
  end
38
53
  end
39
54
  end
@@ -3,8 +3,8 @@ module AnyStyle
3
3
  class Ref < Feature
4
4
  def observe(token, **opts)
5
5
  [
6
- symbolize(count(token, /\b(1\d|20)\d\d\b/)),
7
- symbolize(count(token, /(\d[\(:;]\d)|(\d\s*\p{Pd}+\s*\d)|\bpp?\.|\bvols?\.|\bnos?\./i)),
6
+ symbolize(count(token, /\b(1[4-9]|20)\d\d\b/)),
7
+ symbolize(count(token, /(\d[\(:;]\d)|(\d\s*\p{Pd}+\s*\d)|\bpp?\.|\bvols?\.|\b(nos?|nr|iss?|fasc)\.|n°|nº/i)),
8
8
  symbolize(count(token, /\b\p{Lu}\./)),
9
9
  symbolize(count(token, /\b(eds?\.|edited by|editors?|hg|hrsg|et al)\b/i)),
10
10
  token =~ /^\s*(\[\w+\]|\(\d+\)|\d+\.)\s+/ ? 'T' : 'F'
@@ -13,8 +13,8 @@ module AnyStyle
13
13
 
14
14
  def symbolize(k)
15
15
  return '-' if k < 1
16
- return '+' if k < 3
17
- return '++'
16
+ return '+' if k < 2
17
+ return '*'
18
18
  end
19
19
  end
20
20
  end
@@ -63,18 +63,18 @@ module AnyStyle
63
63
  dataset.map { |doc| doc.references(**opts) }
64
64
  end
65
65
 
66
- def label(input, layout: true, **opts)
67
- dataset = prepare(input, layout: layout, **opts)
66
+ def label(input, layout: true, crop: false, **opts)
67
+ dataset = prepare(input, layout: layout, crop: crop, **opts)
68
68
  output = model.label(dataset, **opts)
69
69
  Wapiti::Dataset.new(dataset.map.with_index { |doc, idx|
70
70
  doc.label(output[idx])
71
71
  })
72
72
  end
73
73
 
74
- def prepare(input, layout: true, **opts)
74
+ def prepare(input, layout: true, crop: false, **opts)
75
75
  case input
76
76
  when String
77
- super(Document.open(input, layout: layout, **opts), **opts)
77
+ super(Document.open(input, layout: layout, crop: false, **opts), **opts)
78
78
  when Array
79
79
  super(Wapiti::Dataset.new(input.map { |f| Document.open(f, **opts) }), **opts)
80
80
  else
@@ -1,5 +1,6 @@
1
1
  module AnyStyle
2
2
  maybe_require 'language_detector'
3
+ maybe_require 'unicode/scripts'
3
4
 
4
5
  class Normalizer
5
6
  class Locale < Normalizer
@@ -8,23 +9,33 @@ module AnyStyle
8
9
  end
9
10
 
10
11
  def normalize(item, **opts)
11
- return item if @ld.nil? || item.key?(:language)
12
-
13
12
  sample = item.values_at(
14
13
  :title,
15
14
  :'container-title',
16
- # :'collection-title',
15
+ :'collection-title',
17
16
  :location,
18
17
  :journal,
19
- :publisher
20
- # :note
18
+ :publisher,
19
+ :note
21
20
  ).flatten.compact.join(' ')
22
21
 
23
22
  return item if sample.empty?
24
23
 
25
- item[:language] = @ld.detect(sample)
24
+ language = detect_language(sample)
25
+ scripts = detect_scripts(sample)
26
+
27
+ item[:language] ||= language unless language.nil?
28
+ item[:scripts] ||= scripts unless scripts.nil?
26
29
  item
27
30
  end
28
31
  end
32
+
33
+ def detect_language(string)
34
+ @ld.detect(string) unless @ld.nil?
35
+ end
36
+
37
+ def detect_scripts(string)
38
+ ::Unicode::Scripts.scripts(string) if defined?(::Unicode::Scripts)
39
+ end
29
40
  end
30
41
  end
@@ -37,7 +37,7 @@ module AnyStyle
37
37
  end
38
38
 
39
39
  def repeater?(value)
40
- value =~ /^[\p{Pd}_*][\p{Pd}_* ]+(,|:|\.|$)/
40
+ value =~ /^([\p{Pd}_*][\p{Pd}_* ]+|\p{Co})(,|:|\.|$)/
41
41
  end
42
42
 
43
43
  def strip(value)