anystyle 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/HISTORY.md +78 -0
  3. data/LICENSE +27 -0
  4. data/README.md +103 -0
  5. data/lib/anystyle.rb +71 -0
  6. data/lib/anystyle/dictionary.rb +132 -0
  7. data/lib/anystyle/dictionary/gdbm.rb +52 -0
  8. data/lib/anystyle/dictionary/lmdb.rb +67 -0
  9. data/lib/anystyle/dictionary/marshal.rb +27 -0
  10. data/lib/anystyle/dictionary/redis.rb +55 -0
  11. data/lib/anystyle/document.rb +264 -0
  12. data/lib/anystyle/errors.rb +14 -0
  13. data/lib/anystyle/feature.rb +27 -0
  14. data/lib/anystyle/feature/affix.rb +43 -0
  15. data/lib/anystyle/feature/brackets.rb +32 -0
  16. data/lib/anystyle/feature/canonical.rb +13 -0
  17. data/lib/anystyle/feature/caps.rb +20 -0
  18. data/lib/anystyle/feature/category.rb +70 -0
  19. data/lib/anystyle/feature/dictionary.rb +16 -0
  20. data/lib/anystyle/feature/indent.rb +16 -0
  21. data/lib/anystyle/feature/keyword.rb +52 -0
  22. data/lib/anystyle/feature/line.rb +39 -0
  23. data/lib/anystyle/feature/locator.rb +18 -0
  24. data/lib/anystyle/feature/number.rb +39 -0
  25. data/lib/anystyle/feature/position.rb +28 -0
  26. data/lib/anystyle/feature/punctuation.rb +22 -0
  27. data/lib/anystyle/feature/quotes.rb +20 -0
  28. data/lib/anystyle/feature/ref.rb +21 -0
  29. data/lib/anystyle/feature/terminal.rb +19 -0
  30. data/lib/anystyle/feature/words.rb +74 -0
  31. data/lib/anystyle/finder.rb +94 -0
  32. data/lib/anystyle/format/bibtex.rb +63 -0
  33. data/lib/anystyle/format/csl.rb +28 -0
  34. data/lib/anystyle/normalizer.rb +65 -0
  35. data/lib/anystyle/normalizer/brackets.rb +13 -0
  36. data/lib/anystyle/normalizer/container.rb +13 -0
  37. data/lib/anystyle/normalizer/date.rb +109 -0
  38. data/lib/anystyle/normalizer/edition.rb +16 -0
  39. data/lib/anystyle/normalizer/journal.rb +14 -0
  40. data/lib/anystyle/normalizer/locale.rb +30 -0
  41. data/lib/anystyle/normalizer/location.rb +24 -0
  42. data/lib/anystyle/normalizer/locator.rb +22 -0
  43. data/lib/anystyle/normalizer/names.rb +88 -0
  44. data/lib/anystyle/normalizer/page.rb +29 -0
  45. data/lib/anystyle/normalizer/publisher.rb +18 -0
  46. data/lib/anystyle/normalizer/pubmed.rb +18 -0
  47. data/lib/anystyle/normalizer/punctuation.rb +23 -0
  48. data/lib/anystyle/normalizer/quotes.rb +14 -0
  49. data/lib/anystyle/normalizer/type.rb +54 -0
  50. data/lib/anystyle/normalizer/volume.rb +26 -0
  51. data/lib/anystyle/parser.rb +199 -0
  52. data/lib/anystyle/support.rb +4 -0
  53. data/lib/anystyle/support/finder.mod +3234 -0
  54. data/lib/anystyle/support/finder.txt +75 -0
  55. data/lib/anystyle/support/parser.mod +15025 -0
  56. data/lib/anystyle/support/parser.txt +75 -0
  57. data/lib/anystyle/utils.rb +70 -0
  58. data/lib/anystyle/version.rb +3 -0
  59. data/res/finder/bb132pr2055.ttx +6803 -0
  60. data/res/finder/bb550sh8053.ttx +18660 -0
  61. data/res/finder/bb599nz4341.ttx +2957 -0
  62. data/res/finder/bb725rt6501.ttx +15276 -0
  63. data/res/finder/bc605xz1554.ttx +18815 -0
  64. data/res/finder/bd040gx5718.ttx +4271 -0
  65. data/res/finder/bd413nt2715.ttx +4956 -0
  66. data/res/finder/bd466fq0394.ttx +6100 -0
  67. data/res/finder/bf668vw2021.ttx +3578 -0
  68. data/res/finder/bg495cx0468.ttx +7267 -0
  69. data/res/finder/bg599vt3743.ttx +6752 -0
  70. data/res/finder/bg608dx2253.ttx +4094 -0
  71. data/res/finder/bh410qk3771.ttx +8785 -0
  72. data/res/finder/bh989ww6442.ttx +17204 -0
  73. data/res/finder/bj581pc8202.ttx +2719 -0
  74. data/res/parser/bad.xml +5199 -0
  75. data/res/parser/core.xml +7924 -0
  76. data/res/parser/gold.xml +2707 -0
  77. data/res/parser/good.xml +34281 -0
  78. data/res/parser/stanford-books.xml +2280 -0
  79. data/res/parser/stanford-diss.xml +726 -0
  80. data/res/parser/stanford-theses.xml +4684 -0
  81. data/res/parser/ugly.xml +33246 -0
  82. metadata +195 -0
@@ -0,0 +1,67 @@
1
module AnyStyle
  require 'lmdb'

  class Dictionary
    # Dictionary adapter that stores its entries in an LMDB environment
    # located in the directory given by the :path option.
    class LMDB < Dictionary
      @defaults = {
        path: File.expand_path('../data', __FILE__),
        mapsize: 1 << 22,
        writemap: true,
        mapasync: true
      }

      # The underlying ::LMDB environment (nil until #open is called).
      attr_reader :env

      # @param options [Hash] overrides for the class-level defaults
      def initialize(options = {})
        super(self.class.defaults.merge(options))
      end

      # Opens the LMDB environment and its database unless already open,
      # and populates the dictionary if the database is empty.
      #
      # @return [self]
      def open
        unless open?
          @env = ::LMDB.new(path, lmdb_options)
          @db = @env.database create: true
        end

        self
      ensure
        # Runs on every exit path; #empty? is false while the database
        # is not open, so a failed open does not trigger population.
        populate! if empty?
      end

      # Closes the environment and forgets the stale handles, so that
      # #open? reports false and a later #open re-creates them.
      # NOTE(review): assumes the inherited #db reader returns @db.
      def close
        return unless open?

        env.close
        @env = nil
        @db = nil
      end

      def open?
        !db.nil?
      end

      # True only if the database is open and holds no entries.
      def empty?
        open? && db.size == 0
      end

      # Closes the environment and deletes its data and lock files.
      def truncate
        close
        %w{ data.mdb lock.mdb }.each do |mdb|
          file = File.join(path, mdb)
          # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
          File.unlink(file) if File.exist?(file)
        end
      end

      # @return [Integer] the stored value for key (0 if there is none,
      #   because nil.to_i is 0)
      def get(key)
        db[key.to_s].to_i
      end

      # Stores value (coerced to an integer) as a string under key.
      def put(key, value)
        db[key.to_s] = value.to_i.to_s
      end

      def path
        options[:path]
      end

      # Options handed to ::LMDB.new: everything except :path and :source.
      def lmdb_options
        options.reject { |key, _| [:path, :source].include?(key) }
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module AnyStyle
  class Dictionary
    # Dictionary adapter that keeps its entries in a Hash which is
    # serialized with ::Marshal to the file given by the :path option.
    class Marshal < Dictionary
      @defaults = {
        path: File.expand_path('../../data/dict.marshal', __FILE__)
      }

      # @param options [Hash] overrides for the class-level defaults
      def initialize(options = {})
        super(self.class.defaults.merge(options))
      end

      # Loads the dictionary from the marshal file if it exists, otherwise
      # starts with an empty Hash. An empty dictionary is populated and
      # then written back to the file.
      #
      # @return [self]
      def open
        # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
        if File.exist?(options[:path])
          # NOTE: Marshal.load can instantiate arbitrary objects; the path
          # must only ever point to a trusted file.
          # Block form closes the handle; binary mode avoids newline
          # translation of the marshalled bytes.
          @db = File.open(options[:path], 'rb') { |file| ::Marshal.load(file) }
        else
          @db = {}
        end
        self
      ensure
        if empty?
          populate!
          # Block form guarantees the dump is flushed and the file closed.
          File.open(options[:path], 'wb') { |file| ::Marshal.dump(db, file) }
        end
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
module AnyStyle
  require 'redis'
  maybe_require 'redis/namespace'

  class Dictionary
    # Dictionary adapter that stores its entries in a Redis server,
    # wrapped in a Redis::Namespace when that library is available.
    class Redis < Dictionary
      @defaults = {
        namespace: 'anystyle',
        port: 6379
      }

      # @param options [Hash] overrides for the class-level defaults
      def initialize(options = {})
        super(self.class.defaults.merge(options))
      end

      # Creates the Redis client unless already open and populates the
      # dictionary if the database is empty.
      #
      # @return [self]
      def open
        unless open?
          @db = ::Redis.new(options)

          # Only wrap the client if a namespace is configured and the
          # optional redis/namespace library was loaded.
          if !namespace.nil? && defined?(::Redis::Namespace)
            @db = ::Redis::Namespace.new namespace, redis: @db
          end
        end

        self
      ensure
        # Runs on every exit path; #empty? is false while no client exists.
        populate! if empty?
      end

      # Closes the client connection; a no-op if it was never opened.
      def close
        db.close if open?
      end

      def open?
        !db.nil?
      end

      # True only if a client exists and the database holds no keys.
      def empty?
        open? && db.dbsize == 0
      end

      # @return [Integer] the stored value for key (0 if there is none,
      #   because nil.to_i is 0)
      def get(key)
        db[key.to_s].to_i
      end

      # Stores value (coerced to an integer) under key.
      def put(key, value)
        db[key.to_s] = value.to_i
      end

      def namespace
        options[:namespace]
      end
    end
  end
end
@@ -0,0 +1,264 @@
1
module AnyStyle
  # A document as a sequence of line tokens. Each token carries the text
  # of one line and a label ('title', 'ref', 'text', 'blank', 'meta', ...).
  class Document < Wapiti::Sequence
    class << self
      include PdfUtils

      # Builds a document from a string, creating one token per line.
      #
      # @param string [String] the document text
      # @param delimiter [Regexp, String] line separator
      # @param tagged [Boolean] if true, each line is expected to look like
      #   "label | text"; a line with an empty label inherits the label of
      #   the previous line
      # @return [Document]
      def parse(string, delimiter: /\r?\n/, tagged: false)
        current_label = ''
        new(string.split(delimiter).map { |line|
          if tagged
            label, line = line.split(/\s*\| /, 2)
            # A blank line splits into [], leaving label nil; keep the
            # previous label in that case instead of raising.
            current_label = label unless label.nil? || label.empty?
          end
          # line is nil when a tagged line has no "| text" part.
          Wapiti::Token.new line.to_s, label: current_label.to_s
        })
      end

      # Reads a document from a file.
      #
      # @param path [String] path to a .pdf, .ttx or .txt file
      # @param format [String] file extension selecting the reader;
      #   defaults to the extension of path
      # @param tagged [Boolean] passed to .parse (forced to true for .ttx)
      # @param layout [Boolean] passed to pdf_to_text for PDF input
      # @param opts [Hash] :parse_meta / :parse_info enable extraction of
      #   PDF meta data / info via pdf_meta / pdf_info
      # @raise [ArgumentError] if the path is tainted, the file does not
      #   exist or the format is not supported
      # @return [Document]
      def open(path, format: File.extname(path), tagged: false, layout: true, **opts)
        # String#tainted? was removed in Ruby 3.2, so only ask when available.
        raise ArgumentError,
          "cannot open tainted path: '#{path}'" if path.respond_to?(:tainted?) && path.tainted?
        raise ArgumentError,
          "document not found: '#{path}'" unless File.exist?(path)

        path = File.absolute_path(path)

        case format.downcase
        when '.pdf'
          meta = pdf_meta path if opts[:parse_meta]
          info = pdf_info path if opts[:parse_info]
          input = pdf_to_text path, layout: layout
        when '.ttx'
          tagged = true
          input = File.read(path, encoding: 'utf-8')
        when '.txt'
          input = File.read(path, encoding: 'utf-8')
        else
          # Without this, input stays nil and parse fails with NoMethodError.
          raise ArgumentError, "unsupported document format: '#{format}'"
        end

        doc = parse input, tagged: tagged
        doc.path = path
        doc.meta = meta
        doc.info = info
        doc
      end
    end

    include StringUtils

    attr_accessor :meta, :info, :path, :tokens
    # Only the writer is generated; the reader below memoizes the pages.
    attr_writer :pages
    alias_method :lines, :tokens

    # @return [Array<Page>] the lines grouped into pages (memoized)
    def pages
      @pages ||= Page.parse(lines)
    end

    # Yields every line together with its index on the page, the page and
    # the page index. Returns an enumerator if no block is given.
    def each
      return to_enum unless block_given?

      pages.each.with_index do |page, pn|
        page.lines.each.with_index do |line, ln|
          yield line, ln, page, pn
        end
      end
      self
    end

    # Yields arrays of consecutive 'ref' and 'text' lines; a 'title' line
    # ends the section collected so far, and lines with any other label
    # are skipped. Returns an enumerator if no block is given.
    def each_section
      # to_enum without arguments would enumerate #each instead.
      return to_enum(:each_section) unless block_given?

      current = []
      lines.each do |ln|
        case ln.label
        when 'title'
          unless current.empty?
            yield current
            current = []
          end
        when 'ref', 'text'
          current << ln
        else
          # ignore
        end
      end
      yield current unless current.empty?
      self
    end

    # Returns a copy of this document whose tokens keep their values but
    # take labels and observations from the tokens of other at the same
    # index.
    def label(other)
      doc = dup
      doc.tokens = lines.map.with_index { |line, idx|
        Wapiti::Token.new line.value,
          label: other[idx].label.to_s,
          observations: other[idx].observations.dup
      }
      # dup copied the memoized pages, which still hold the old tokens.
      doc.pages = nil
      doc
    end

    # Serializes the document. With tagged: true every line is written as
    # "label | text", where the label is left blank when it repeats the
    # label of the previous line; otherwise defers to the superclass.
    def to_s(delimiter: "\n", encode: false, tagged: false, **opts)
      if tagged
        prev_label = nil
        lines.map { |ln|
          label = (ln.label == prev_label) ? '' : ln.label
          prev_label = ln.label
          # Pad and truncate the label to a fixed 14 character column;
          # .parse strips the padding again when splitting on "| ".
          '%-14.14s| %s' % [label, ln.value]
        }.join(delimiter)
      else
        super(delimiter: delimiter, encode: encode, tagged: tagged, expanded: false, **opts)
      end
    end

    def to_a(encode: true, **opts)
      super(encode: encode, **opts)
    end

    def to_h(**opts)
      {
        info: info,
        meta: meta,
        sections: sections(**opts),
        title: title(**opts),
        references: references(**opts)
      }
    end

    # Collects the text of 'ref' lines into reference strings, joining
    # consecutive lines that #join_refs? considers part of one reference.
    #
    # @return [Array<String>]
    def references(**opts)
      bib, current, delta, indent = [], nil, 0, 0

      lines.each do |ln|
        case ln.label
        when 'ref'
          val = display_chars(ln.value).rstrip
          idt = val[/^\s*/].length
          val.lstrip!

          if current.nil?
            current, delta, indent = val, 0, idt
          elsif join_refs?(current, val, delta, idt - indent)
            current = join_refs(current, val)
          else
            bib << current
            current, delta, indent = val, 0, idt
          end
        else
          unless current.nil?
            # delta counts the non-ref lines seen since the current
            # reference started.
            # NOTE(review): `delta > 15` keeps a reference open once more
            # than 15 such lines were skipped - confirm this is intended.
            if delta > 15 || %w{ blank meta }.include?(ln.label)
              delta += 1
            else
              bib << current
              # indent is always reassigned before it is read again.
              current, delta = nil, 0
            end
          end
        end
      end

      bib << current unless current.nil?

      bib
    end

    # Heuristic deciding whether line b continues reference a: counts the
    # indicators in favour and against and requires a margin of more than
    # one.
    #
    # @param a [String] the reference text collected so far
    # @param b [String] the candidate continuation line
    # @param delta [Integer] non-ref lines skipped since a started
    # @param indent [Integer] indentation of b relative to a's first line
    def join_refs?(a, b, delta = 0, indent = 0)
      pro = [
        indent > 0,
        delta == 0,
        b.length < 50,
        a.length < 65,
        a.match?(/[,;:&\p{Pd}]$/),
        b.match?(/^\p{Ll}/) || a.match?(/\p{L}$/) && b.match?(/^\p{L}/)
      ].count(true)

      con = [
        indent < 0,
        delta > 8,
        a.match?(/\.\]$/),
        a.length > 500,
        (b.length - a.length) > 12,
        b.match?(/^(\p{Pd}\p{Pd}|\p{Lu}\p{Ll}+, \p{Lu}\.|\[\d)/)
      ].count(true)

      (pro - con) > 1
    end

    # Joins two parts of a reference. A trailing hyphen on a is dropped
    # when b starts with a lower-case letter, and kept without a space
    # otherwise; in all other cases the parts are joined with a space.
    def join_refs(a, b)
      if a[-1] == '-'
        if b.match?(/^\p{Ll}/)
          "#{a[0...-1]}#{b}"
        else
          "#{a}#{b}"
        end
      else
        "#{a} #{b}"
      end
    end

    # Currently always returns an empty list.
    def sections(delimiter: "\n", **opts)
      []
    end

    # @return [String] the values of the first run of consecutive 'title'
    #   lines, joined with delimiter
    def title(delimiter: " ", **opts)
      lines.drop_while { |ln|
        ln.label != 'title'
      }.take_while { |ln|
        ln.label == 'title'
      }.map(&:value).join(delimiter)
    end

    def inspect
      "#<AnyStyle::Document lines={#{size}}>"
    end


    # A page of a document: its lines and the widest display width found
    # among them.
    class Page
      extend StringUtils

      class << self
        # Splits lines into pages. A line for which page_break? is true
        # starts a new page and becomes that page's first line.
        #
        # @param lines [Array] tokens responding to #value
        # @return [Array<Page>]
        def parse(lines)
          pages, current, width = [], [], 0

          lines.each do |line|
            if page_break?(line.value)
              unless current.empty?
                pages << new(current, width: width)
              end

              current = [line]
              width = display_width(line.value)
            else
              current << line
              width = [width, display_width(line.value)].max
            end
          end

          unless current.empty?
            pages << new(current, width: width)
          end

          pages
        end
      end

      attr_accessor :lines, :width

      def initialize(lines = [], width: 0)
        @lines = lines
        @width = width
      end

      def size
        lines.size
      end

      def inspect
        "#<AnyStyle::Document::Page size={#{size}} width={#{width}}>"
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
module AnyStyle
  # Base class for AnyStyle errors.
  #
  # Remembers the exception that was being handled when the error was
  # created, so callers can inspect the underlying failure.
  class Error < StandardError

    # @return [Exception, nil] the wrapped exception, if any
    attr_accessor :original

    # @param message [String, nil] error message passed on to StandardError
    # @param original [Exception, nil] the wrapped exception; defaults to
    #   `$!`, the exception currently being handled (nil outside a rescue)
    def initialize(message = nil, original = $!)
      super(message)
      @original = original
    end

  end

  # Error subclass; presumably raised when model training fails
  # (NOTE(review): confirm against the call sites).
  class TrainingError < Error; end
end
@@ -0,0 +1,27 @@
1
module AnyStyle
  # Base class for features; subclasses implement #observe.
  class Feature
    include StringUtils

    # @return [Integer] scaling factor used by #ratio
    attr_reader :precision

    # @param precision [Integer] scaling factor used by #ratio
    def initialize(precision: 10, **opts)
      @precision = precision
    end

    # Must be overridden by subclasses.
    #
    # @raise [NotImplementedError] always, in this base class
    def observe(token, **opts)
      raise NotImplementedError
    end

    # @param idx [Integer] index of the current element
    # @param seq [#[]] the sequence being traversed
    # @return the element following idx (nil past the end of an Array)
    def next(idx, seq)
      seq[idx + 1]
    end

    # @param idx [Integer] index of the current element
    # @param seq [#[]] the sequence being traversed
    # @return the element preceding idx, or nil for the first element
    #   (a plain seq[-1] would wrap around to the last one)
    def prev(idx, seq)
      idx == 0 ? nil : seq[idx - 1]
    end

    # @return [Integer] x / y scaled by precision and rounded; 0 if y is
    #   not positive
    def ratio(x, y)
      (y > 0) ? ((x.to_f / y) * precision).round : 0
    end
  end
end