anystyle 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +7 -0
  2. data/HISTORY.md +78 -0
  3. data/LICENSE +27 -0
  4. data/README.md +103 -0
  5. data/lib/anystyle.rb +71 -0
  6. data/lib/anystyle/dictionary.rb +132 -0
  7. data/lib/anystyle/dictionary/gdbm.rb +52 -0
  8. data/lib/anystyle/dictionary/lmdb.rb +67 -0
  9. data/lib/anystyle/dictionary/marshal.rb +27 -0
  10. data/lib/anystyle/dictionary/redis.rb +55 -0
  11. data/lib/anystyle/document.rb +264 -0
  12. data/lib/anystyle/errors.rb +14 -0
  13. data/lib/anystyle/feature.rb +27 -0
  14. data/lib/anystyle/feature/affix.rb +43 -0
  15. data/lib/anystyle/feature/brackets.rb +32 -0
  16. data/lib/anystyle/feature/canonical.rb +13 -0
  17. data/lib/anystyle/feature/caps.rb +20 -0
  18. data/lib/anystyle/feature/category.rb +70 -0
  19. data/lib/anystyle/feature/dictionary.rb +16 -0
  20. data/lib/anystyle/feature/indent.rb +16 -0
  21. data/lib/anystyle/feature/keyword.rb +52 -0
  22. data/lib/anystyle/feature/line.rb +39 -0
  23. data/lib/anystyle/feature/locator.rb +18 -0
  24. data/lib/anystyle/feature/number.rb +39 -0
  25. data/lib/anystyle/feature/position.rb +28 -0
  26. data/lib/anystyle/feature/punctuation.rb +22 -0
  27. data/lib/anystyle/feature/quotes.rb +20 -0
  28. data/lib/anystyle/feature/ref.rb +21 -0
  29. data/lib/anystyle/feature/terminal.rb +19 -0
  30. data/lib/anystyle/feature/words.rb +74 -0
  31. data/lib/anystyle/finder.rb +94 -0
  32. data/lib/anystyle/format/bibtex.rb +63 -0
  33. data/lib/anystyle/format/csl.rb +28 -0
  34. data/lib/anystyle/normalizer.rb +65 -0
  35. data/lib/anystyle/normalizer/brackets.rb +13 -0
  36. data/lib/anystyle/normalizer/container.rb +13 -0
  37. data/lib/anystyle/normalizer/date.rb +109 -0
  38. data/lib/anystyle/normalizer/edition.rb +16 -0
  39. data/lib/anystyle/normalizer/journal.rb +14 -0
  40. data/lib/anystyle/normalizer/locale.rb +30 -0
  41. data/lib/anystyle/normalizer/location.rb +24 -0
  42. data/lib/anystyle/normalizer/locator.rb +22 -0
  43. data/lib/anystyle/normalizer/names.rb +88 -0
  44. data/lib/anystyle/normalizer/page.rb +29 -0
  45. data/lib/anystyle/normalizer/publisher.rb +18 -0
  46. data/lib/anystyle/normalizer/pubmed.rb +18 -0
  47. data/lib/anystyle/normalizer/punctuation.rb +23 -0
  48. data/lib/anystyle/normalizer/quotes.rb +14 -0
  49. data/lib/anystyle/normalizer/type.rb +54 -0
  50. data/lib/anystyle/normalizer/volume.rb +26 -0
  51. data/lib/anystyle/parser.rb +199 -0
  52. data/lib/anystyle/support.rb +4 -0
  53. data/lib/anystyle/support/finder.mod +3234 -0
  54. data/lib/anystyle/support/finder.txt +75 -0
  55. data/lib/anystyle/support/parser.mod +15025 -0
  56. data/lib/anystyle/support/parser.txt +75 -0
  57. data/lib/anystyle/utils.rb +70 -0
  58. data/lib/anystyle/version.rb +3 -0
  59. data/res/finder/bb132pr2055.ttx +6803 -0
  60. data/res/finder/bb550sh8053.ttx +18660 -0
  61. data/res/finder/bb599nz4341.ttx +2957 -0
  62. data/res/finder/bb725rt6501.ttx +15276 -0
  63. data/res/finder/bc605xz1554.ttx +18815 -0
  64. data/res/finder/bd040gx5718.ttx +4271 -0
  65. data/res/finder/bd413nt2715.ttx +4956 -0
  66. data/res/finder/bd466fq0394.ttx +6100 -0
  67. data/res/finder/bf668vw2021.ttx +3578 -0
  68. data/res/finder/bg495cx0468.ttx +7267 -0
  69. data/res/finder/bg599vt3743.ttx +6752 -0
  70. data/res/finder/bg608dx2253.ttx +4094 -0
  71. data/res/finder/bh410qk3771.ttx +8785 -0
  72. data/res/finder/bh989ww6442.ttx +17204 -0
  73. data/res/finder/bj581pc8202.ttx +2719 -0
  74. data/res/parser/bad.xml +5199 -0
  75. data/res/parser/core.xml +7924 -0
  76. data/res/parser/gold.xml +2707 -0
  77. data/res/parser/good.xml +34281 -0
  78. data/res/parser/stanford-books.xml +2280 -0
  79. data/res/parser/stanford-diss.xml +726 -0
  80. data/res/parser/stanford-theses.xml +4684 -0
  81. data/res/parser/ugly.xml +33246 -0
  82. metadata +195 -0
@@ -0,0 +1,67 @@
1
+ module AnyStyle
2
+ require 'lmdb'
3
+
4
+ class Dictionary
5
# Dictionary adapter backed by an LMDB environment located in the
# directory given by the :path option.
class LMDB < Dictionary
  @defaults = {
    path: File.expand_path('../data', __FILE__),
    mapsize: 1 << 22, # 4_194_304
    writemap: true,
    mapasync: true
  }

  attr_reader :env

  # options - Hash merged over the class-level defaults before being
  #           handed to the superclass.
  def initialize(options = {})
    super(self.class.defaults.merge(options))
  end

  # Creates the LMDB environment and its database handle unless the
  # dictionary is already open.
  #
  # The ensure clause calls populate! whenever the dictionary reports
  # empty? afterwards, including when the body raised.
  #
  # Returns self.
  def open
    unless open?
      @env = ::LMDB.new(path, lmdb_options)
      @db = @env.database create: true
    end

    self
  ensure
    populate! if empty?
  end

  # Closes the environment if the dictionary is open.
  #
  # NOTE(review): the database handle is not reset here, so open? may keep
  # returning true after close -- confirm against Dictionary#db.
  def close
    env.close if open?
  end

  # Returns true when a database handle is present.
  def open?
    !db.nil?
  end

  # Returns true when the dictionary is open and holds no entries.
  def empty?
    open? && db.size == 0
  end

  # Closes the environment and removes the data and lock files found
  # below path.
  def truncate
    close
    %w{ data.mdb lock.mdb }.each do |name|
      file = File.join(path, name)
      # File.exists? was removed in Ruby 3.2; File.exist? is the
      # supported spelling and is what Document.open uses as well.
      File.unlink(file) if File.exist?(file)
    end
  end

  # Returns the value stored under key, converted with to_i.
  def get(key)
    db[key.to_s].to_i
  end

  # Stores value under key; the value is written as the string form of
  # value.to_i.
  def put(key, value)
    db[key.to_s] = value.to_i.to_s
  end

  # Returns the configured :path option.
  def path
    options[:path]
  end

  # Returns the options without :path and :source; the result is what
  # open passes to ::LMDB.new.
  def lmdb_options
    options.reject { |key, _value| [:path, :source].include?(key) }
  end
end
66
+ end
67
+ end
@@ -0,0 +1,27 @@
1
+ module AnyStyle
2
+ class Dictionary
3
+ class Marshal < Dictionary
4
+ @defaults = {
5
+ path: File.expand_path('../../data/dict.marshal', __FILE__)
6
+ }
7
+
8
# Merges the caller's options over the class-level defaults and hands the
# result to the superclass initializer.
def initialize(options = {})
  settings = self.class.defaults.merge(options)
  super(settings)
end
11
+
12
# Loads the dictionary from the marshal file at options[:path], or starts
# with an empty Hash when that file does not exist.
#
# The ensure clause runs populate! and writes the dictionary back to
# options[:path] whenever it is empty? afterwards.
#
# Returns self.
def open
  # File.exists? was removed in Ruby 3.2; File.exist? is the supported form.
  if File.exist?(options[:path])
    # Block form closes the handle; 'rb' because marshal data is binary.
    # NOTE(review): Marshal.load must only ever see trusted files -- confirm
    # that options[:path] cannot point at user-supplied data.
    @db = File.open(options[:path], 'rb') { |io| ::Marshal.load(io) }
  else
    @db = {}
  end
  self
ensure
  if empty?
    populate!
    # Block form guarantees the dump is flushed and the handle closed.
    File.open(options[:path], 'wb') { |io| ::Marshal.dump(db, io) }
  end
end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,55 @@
1
+ module AnyStyle
2
+ require 'redis'
3
+ maybe_require 'redis/namespace'
4
+
5
+ class Dictionary
6
# Dictionary adapter that stores its entries in a Redis database,
# optionally wrapped in a Redis::Namespace.
class Redis < Dictionary
  @defaults = {
    namespace: 'anystyle',
    port: 6379
  }

  # options - Hash merged over the class-level defaults before being
  #           handed to the superclass.
  def initialize(options = {})
    super(self.class.defaults.merge(options))
  end

  # Connects to Redis unless a connection is already present. The client
  # is wrapped in a Redis::Namespace when a namespace is configured and
  # that constant is defined.
  #
  # The ensure clause calls populate! whenever the dictionary reports
  # empty? afterwards.
  #
  # Returns self.
  def open
    unless open?
      @db = ::Redis.new(options)

      if !namespace.nil? && defined?(::Redis::Namespace)
        @db = ::Redis::Namespace.new namespace, redis: @db
      end
    end

    self
  ensure
    populate! if empty?
  end

  # Closes the connection. Guarded by open? (as in the LMDB adapter) so
  # that closing a dictionary that was never opened is a no-op rather than
  # a NoMethodError on nil.
  def close
    db.close if open?
  end

  # Returns true when a connection object is present.
  def open?
    !db.nil?
  end

  # Returns true when the dictionary is open and dbsize reports zero.
  def empty?
    open? && db.dbsize == 0
  end

  # Returns the value stored under key, converted with to_i.
  def get(key)
    db[key.to_s].to_i
  end

  # Stores value.to_i under key.
  def put(key, value)
    db[key.to_s] = value.to_i
  end

  # Returns the configured :namespace option.
  def namespace
    options[:namespace]
  end
end
54
+ end
55
+ end
@@ -0,0 +1,264 @@
1
+ module AnyStyle
2
+ class Document < Wapiti::Sequence
3
+ class << self
4
+ include PdfUtils
5
+
6
# Builds a document from a string, creating one Wapiti::Token per line.
#
# string    - The text to split.
# delimiter - Pattern used to split the text into lines.
# tagged    - When true, each line is expected to look like
#             "label | text"; a line with an empty label inherits the
#             label of the closest labelled line above it.
#
# Returns the result of calling new with the list of tokens.
def parse(string, delimiter: /\r?\n/, tagged: false)
  current_label = ''
  tokens = string.split(delimiter).map do |line|
    if tagged
      label, line = line.split(/\s*\| /, 2)
      # An empty line splits into [], leaving label and line nil;
      # treat that as "no label, empty text" instead of crashing.
      current_label = label unless label.nil? || label.empty?
      line ||= ''
    end
    Wapiti::Token.new line, label: current_label.to_s
  end
  new(tokens)
end
16
+
17
# Reads the file at path and parses it into a document.
#
# path   - Path of the file to read.
# format - File extension selecting the reader ('.pdf', '.ttx' or '.txt',
#          compared case-insensitively); defaults to the extension of path.
# tagged - Passed on to parse; forced to true for '.ttx'.
# layout - Passed to pdf_to_text for PDF input.
# opts   - :parse_meta / :parse_info request pdf_meta / pdf_info for PDFs.
#
# Returns the parsed document with path, meta and info assigned.
# Raises ArgumentError if the path is tainted, the file does not exist or
# the format is not one of the supported extensions.
def open(path, format: File.extname(path), tagged: false, layout: true, **opts)
  # Object#tainted? was removed in Ruby 3.2; only consult it where it
  # still exists.
  raise ArgumentError,
    "cannot open tainted path: '#{path}'" if path.respond_to?(:tainted?) && path.tainted?
  raise ArgumentError,
    "document not found: '#{path}'" unless File.exist?(path)

  path = File.absolute_path(path)
  meta = info = nil

  case format.downcase
  when '.pdf'
    meta = pdf_meta path if opts[:parse_meta]
    info = pdf_info path if opts[:parse_info]
    input = pdf_to_text path, layout: layout
  when '.ttx'
    tagged = true
    input = File.read(path, encoding: 'utf-8')
  when '.txt'
    input = File.read(path, encoding: 'utf-8')
  else
    # Without this branch input stayed nil and parse failed with an
    # unrelated NoMethodError.
    raise ArgumentError, "unsupported document format: '#{format}'"
  end

  doc = parse input, tagged: tagged
  doc.path = path
  doc.meta = meta
  doc.info = info
  doc
end
43
+ end
44
+
45
+ include StringUtils
46
+
47
+ attr_accessor :meta, :info, :path, :pages, :tokens
48
+ alias_method :lines, :tokens
49
+
50
# Returns the pages of this document, computing them with Page.parse from
# the document's lines on first use and reusing the result afterwards.
def pages
  @pages = Page.parse(lines) unless @pages
  @pages
end
53
+
54
# Walks every line of every page.
#
# Yields the line, its index within the page, the page and the page index.
#
# Returns self when a block is given, otherwise an enumerator.
def each
  return to_enum unless block_given?

  pages.each_with_index do |page, page_number|
    page.lines.each_with_index do |line, line_number|
      yield line, line_number, page, page_number
    end
  end

  self
end
66
+
67
# Groups consecutive 'ref' and 'text' lines into sections; a 'title' line
# closes the section collected so far. Lines with any other label are
# skipped.
#
# Yields each non-empty section as an Array of lines.
#
# Returns self when a block is given, otherwise an enumerator over the
# sections.
def each_section
  # A bare to_enum would enumerate #each (lines), not the sections.
  return to_enum(:each_section) unless block_given?

  current = []

  lines.each do |ln|
    case ln.label
    when 'title'
      unless current.empty?
        yield current
        current = []
      end
    when 'ref', 'text'
      current << ln
    end
  end

  yield current unless current.empty?

  self
end
91
+
92
# Returns a copy of this document whose tokens keep this document's line
# values but take their label and (duplicated) observations from the
# entry at the same index in other.
def label(other)
  doc = dup
  doc.tokens = lines.each_with_index.map do |line, idx|
    source = other[idx]
    Wapiti::Token.new line.value,
      label: source.label.to_s,
      observations: source.observations.dup
  end
  doc
end
101
+
102
+ def to_s(delimiter: "\n", encode: false, tagged: false, **opts)
103
+ if tagged
104
+ prev_label = nil
105
+ lines.map { |ln|
106
+ label = (ln.label == prev_label) ? '' : ln.label
107
+ prev_label = ln.label
108
+ '%.14s| %s' % ["#{label} ", ln.value]
109
+ }.join(delimiter)
110
+ else
111
+ super(delimiter: delimiter, encode: encode, tagged: tagged, expanded: false, **opts)
112
+ end
113
+ end
114
+
115
# Delegates to the superclass, with encode defaulting to true.
def to_a(encode: true, **opts)
  forwarded = { encode: encode }.merge(opts)
  super(**forwarded)
end
118
+
119
# Returns a Hash with the document's info, meta, sections, title and
# references; opts are passed on to sections, title and references.
def to_h(**opts)
  summary = { info: info, meta: meta }
  summary[:sections] = sections(**opts)
  summary[:title] = title(**opts)
  summary[:references] = references(**opts)
  summary
end
128
+
129
# Collects the reference strings of the document.
#
# Lines labelled 'ref' are passed through display_chars and stripped; a
# line is appended to the reference being collected when join_refs?
# agrees, otherwise it starts a new reference. For any other line while a
# reference is being collected: the gap counter is incremented if it
# already exceeds 15 or the line is labelled 'blank' or 'meta'; otherwise
# the reference is finished.
#
# Returns an Array of Strings.
def references(**opts)
  bib = []
  current = nil
  delta = 0
  indent = 0

  lines.each do |ln|
    if 'ref' == ln.label
      text = display_chars(ln.value).rstrip
      lead = text[/^\s*/].length
      text.lstrip!

      if !current.nil? && join_refs?(current, text, delta, lead - indent)
        current = join_refs(current, text)
      else
        bib << current unless current.nil?
        current, delta, indent = text, 0, lead
      end
    elsif !current.nil?
      if delta > 15 || %w{ blank meta }.include?(ln.label)
        delta += 1
      else
        bib << current
        # No indentation is known for a non-reference line.
        current, delta, indent = nil, 0, nil
      end
    end
  end

  bib << current unless current.nil?
  bib
end
167
+
168
# Decides whether line b continues reference a.
#
# a      - The reference text collected so far.
# b      - The candidate continuation line.
# delta  - Gap counter supplied by the caller.
# indent - Indentation of b relative to the start of the reference.
#
# Six indicators in favour and six against are counted; the lines are
# joined when the indicators in favour lead by more than one.
def join_refs?(a, b, delta = 0, indent = 0)
  supporting = 0
  supporting += 1 if indent > 0
  supporting += 1 if delta == 0
  supporting += 1 if b.length < 50
  supporting += 1 if a.length < 65
  supporting += 1 if a.match?(/[,;:&\p{Pd}]$/)
  supporting += 1 if b.match?(/^\p{Ll}/) || a.match?(/\p{L}$/) && b.match?(/^\p{L}/)

  opposing = 0
  opposing += 1 if indent < 0
  opposing += 1 if delta > 8
  opposing += 1 if a.match?(/\.\]$/)
  opposing += 1 if a.length > 500
  opposing += 1 if (b.length - a.length) > 12
  opposing += 1 if b.match?(/^(\p{Pd}\p{Pd}|\p{Lu}\p{Ll}+, \p{Lu}\.|\[\d)/)

  (supporting - opposing) > 1
end
189
+
190
# Joins two parts of a reference.
#
# When a ends in a hyphen the parts are concatenated directly; the hyphen
# is dropped if b begins with a lowercase letter. In all other cases the
# parts are separated by a single space.
def join_refs(a, b)
  return "#{a} #{b}" unless a[-1] == '-'
  return "#{a[0...-1]}#{b}" if b.match?(/^\p{Ll}/)

  "#{a}#{b}"
end
201
+
202
# Always returns a new empty Array; the delimiter and any other options
# are accepted but not used.
def sections(delimiter: "\n", **opts)
  Array.new
end
205
+
206
# Returns the values of the first run of consecutive 'title' lines,
# joined with delimiter. Later 'title' lines are not included.
def title(delimiter: " ", **opts)
  from_first_title = lines.drop_while { |ln| ln.label != 'title' }
  heading = from_first_title.take_while { |ln| ln.label == 'title' }
  heading.map { |ln| ln.value }.join(delimiter)
end
213
+
214
+ def inspect
215
+ "#<AnyStyle::Document lines={#{size}}>"
216
+ end
217
+
218
+
219
# A run of lines belonging to one page, together with the largest display
# width found among them.
class Page
  extend StringUtils

  class << self
    # Splits lines into pages. A line for which page_break? holds starts
    # a new page (and is its first line); every other line is appended to
    # the page being collected.
    #
    # Returns an Array of Page instances.
    def parse(lines)
      pages = []
      buffer = []
      widest = 0

      lines.each do |line|
        starts_page = page_break?(line.value)
        columns = display_width(line.value)

        if starts_page
          pages << new(buffer, width: widest) unless buffer.empty?
          buffer = [line]
          widest = columns
        else
          buffer << line
          widest = [widest, columns].max
        end
      end

      pages << new(buffer, width: widest) unless buffer.empty?
      pages
    end
  end

  attr_accessor :lines, :width

  # lines - The lines on this page.
  # width - The display width recorded for this page.
  def initialize(lines = [], width: 0)
    @width = width
    @lines = lines
  end

  # Returns the number of lines on the page.
  def size
    lines.size
  end

  # Returns a short description including size and width.
  def inspect
    line_count = size
    "#<AnyStyle::Document::Page size={#{line_count}} width={#{width}}>"
  end
end
263
+ end
264
+ end
@@ -0,0 +1,14 @@
1
module AnyStyle
  # Base class of the errors defined in this namespace. It remembers the
  # exception that was current ($!) when the error was created, unless an
  # explicit original is given.
  class Error < StandardError
    attr_reader :original
    attr_writer :original

    # message  - Optional error message, passed to StandardError.
    # original - The underlying exception; defaults to $!.
    def initialize(message = nil, original = $!)
      @original = original
      super(message)
    end
  end

  # Error subclass with no behavior of its own.
  class TrainingError < Error
  end
end
@@ -0,0 +1,27 @@
1
+ module AnyStyle
2
+ class Feature
3
+ include StringUtils
4
+
5
+ attr_reader :precision
6
+
7
# precision - Scaling factor that ratio multiplies by (default: 10).
#
# Any further keyword options are accepted and ignored.
def initialize(precision: 10, **_ignored)
  @precision = precision
end
10
+
11
# Placeholder that always raises NotImplementedError.
def observe(token, **opts)
  fail NotImplementedError
end
14
+
15
# Returns the element that follows position idx in seq, or nil when idx
# is the last position.
def next(idx, seq)
  # The original read from an undefined `sequence` instead of the `seq`
  # parameter and raised NameError on every call.
  seq[idx + 1]
end
18
+
19
# Returns the element that precedes position idx in seq, or nil for the
# first position (so that index -1 never wraps around to the last element).
def prev(idx, seq)
  return nil if idx == 0

  seq[idx - 1]
end
22
+
23
# Returns x / y scaled by precision and rounded to an Integer, or 0 when
# y is not positive.
def ratio(x, y)
  return 0 unless y > 0

  scaled = (x.to_f / y) * precision
  scaled.round
end
26
+ end
27
+ end