anystyle 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/HISTORY.md +78 -0
- data/LICENSE +27 -0
- data/README.md +103 -0
- data/lib/anystyle.rb +71 -0
- data/lib/anystyle/dictionary.rb +132 -0
- data/lib/anystyle/dictionary/gdbm.rb +52 -0
- data/lib/anystyle/dictionary/lmdb.rb +67 -0
- data/lib/anystyle/dictionary/marshal.rb +27 -0
- data/lib/anystyle/dictionary/redis.rb +55 -0
- data/lib/anystyle/document.rb +264 -0
- data/lib/anystyle/errors.rb +14 -0
- data/lib/anystyle/feature.rb +27 -0
- data/lib/anystyle/feature/affix.rb +43 -0
- data/lib/anystyle/feature/brackets.rb +32 -0
- data/lib/anystyle/feature/canonical.rb +13 -0
- data/lib/anystyle/feature/caps.rb +20 -0
- data/lib/anystyle/feature/category.rb +70 -0
- data/lib/anystyle/feature/dictionary.rb +16 -0
- data/lib/anystyle/feature/indent.rb +16 -0
- data/lib/anystyle/feature/keyword.rb +52 -0
- data/lib/anystyle/feature/line.rb +39 -0
- data/lib/anystyle/feature/locator.rb +18 -0
- data/lib/anystyle/feature/number.rb +39 -0
- data/lib/anystyle/feature/position.rb +28 -0
- data/lib/anystyle/feature/punctuation.rb +22 -0
- data/lib/anystyle/feature/quotes.rb +20 -0
- data/lib/anystyle/feature/ref.rb +21 -0
- data/lib/anystyle/feature/terminal.rb +19 -0
- data/lib/anystyle/feature/words.rb +74 -0
- data/lib/anystyle/finder.rb +94 -0
- data/lib/anystyle/format/bibtex.rb +63 -0
- data/lib/anystyle/format/csl.rb +28 -0
- data/lib/anystyle/normalizer.rb +65 -0
- data/lib/anystyle/normalizer/brackets.rb +13 -0
- data/lib/anystyle/normalizer/container.rb +13 -0
- data/lib/anystyle/normalizer/date.rb +109 -0
- data/lib/anystyle/normalizer/edition.rb +16 -0
- data/lib/anystyle/normalizer/journal.rb +14 -0
- data/lib/anystyle/normalizer/locale.rb +30 -0
- data/lib/anystyle/normalizer/location.rb +24 -0
- data/lib/anystyle/normalizer/locator.rb +22 -0
- data/lib/anystyle/normalizer/names.rb +88 -0
- data/lib/anystyle/normalizer/page.rb +29 -0
- data/lib/anystyle/normalizer/publisher.rb +18 -0
- data/lib/anystyle/normalizer/pubmed.rb +18 -0
- data/lib/anystyle/normalizer/punctuation.rb +23 -0
- data/lib/anystyle/normalizer/quotes.rb +14 -0
- data/lib/anystyle/normalizer/type.rb +54 -0
- data/lib/anystyle/normalizer/volume.rb +26 -0
- data/lib/anystyle/parser.rb +199 -0
- data/lib/anystyle/support.rb +4 -0
- data/lib/anystyle/support/finder.mod +3234 -0
- data/lib/anystyle/support/finder.txt +75 -0
- data/lib/anystyle/support/parser.mod +15025 -0
- data/lib/anystyle/support/parser.txt +75 -0
- data/lib/anystyle/utils.rb +70 -0
- data/lib/anystyle/version.rb +3 -0
- data/res/finder/bb132pr2055.ttx +6803 -0
- data/res/finder/bb550sh8053.ttx +18660 -0
- data/res/finder/bb599nz4341.ttx +2957 -0
- data/res/finder/bb725rt6501.ttx +15276 -0
- data/res/finder/bc605xz1554.ttx +18815 -0
- data/res/finder/bd040gx5718.ttx +4271 -0
- data/res/finder/bd413nt2715.ttx +4956 -0
- data/res/finder/bd466fq0394.ttx +6100 -0
- data/res/finder/bf668vw2021.ttx +3578 -0
- data/res/finder/bg495cx0468.ttx +7267 -0
- data/res/finder/bg599vt3743.ttx +6752 -0
- data/res/finder/bg608dx2253.ttx +4094 -0
- data/res/finder/bh410qk3771.ttx +8785 -0
- data/res/finder/bh989ww6442.ttx +17204 -0
- data/res/finder/bj581pc8202.ttx +2719 -0
- data/res/parser/bad.xml +5199 -0
- data/res/parser/core.xml +7924 -0
- data/res/parser/gold.xml +2707 -0
- data/res/parser/good.xml +34281 -0
- data/res/parser/stanford-books.xml +2280 -0
- data/res/parser/stanford-diss.xml +726 -0
- data/res/parser/stanford-theses.xml +4684 -0
- data/res/parser/ugly.xml +33246 -0
- metadata +195 -0
@@ -0,0 +1,67 @@
|
|
1
|
+
module AnyStyle
|
2
|
+
require 'lmdb'
|
3
|
+
|
4
|
+
class Dictionary
|
5
|
+
class LMDB < Dictionary
|
6
|
+
@defaults = {
|
7
|
+
path: File.expand_path('../data', __FILE__),
|
8
|
+
mapsize: 1 << 22,
|
9
|
+
writemap: true,
|
10
|
+
mapasync: true
|
11
|
+
}
|
12
|
+
|
13
|
+
attr_reader :env
|
14
|
+
|
15
|
+
# Builds an LMDB-backed dictionary.
#
# The class-level defaults are merged with +options+ (entries in +options+
# win on key collisions) and the merged hash is passed to the superclass
# initializer.
def initialize(options = {})
  merged = self.class.defaults.merge(options)
  super(merged)
end
|
18
|
+
|
19
|
+
# Opens the LMDB environment at +path+ and its database (created on demand)
# unless the dictionary is already open. Returns self.
#
# The +ensure+ clause runs on every exit path, so an open but empty
# dictionary is populated whether the body returned early, ran to the end
# or raised.
def open
  return self if open?

  @env = ::LMDB.new(path, lmdb_options)
  @db = @env.database create: true

  self
ensure
  populate! if empty?
end
|
29
|
+
|
30
|
+
# Closes the LMDB environment; does nothing when the dictionary is not open.
#
# NOTE(review): @db is not cleared here, so +open?+ presumably keeps
# reporting true after closing -- confirm against Dictionary#db.
def close
  return unless open?

  env.close
end
|
33
|
+
|
34
|
+
# True once a database handle has been assigned, false otherwise.
def open?
  return false if db.nil?

  true
end
|
37
|
+
|
38
|
+
# True only when the dictionary is open and its database reports a size of
# zero; a dictionary that is not open is never considered empty.
def empty?
  open? && db.size == 0
end
|
41
|
+
|
42
|
+
# Closes the dictionary and deletes the LMDB files (data.mdb and lock.mdb)
# found in +path+. Files that do not exist are skipped, so calling this
# repeatedly is safe.
#
# Uses File.exist?: the File.exists? alias was removed in Ruby 3.2 and
# raises NoMethodError there.
def truncate
  close
  %w{ data.mdb lock.mdb }.each do |name|
    file = File.join(path, name)
    File.unlink(file) if File.exist?(file)
  end
end
|
49
|
+
|
50
|
+
# Looks up +key+ (converted to a String) and returns the stored value
# converted with #to_i.
def get(key)
  raw = db[key.to_s]
  raw.to_i
end
|
53
|
+
|
54
|
+
# Stores +value+ under +key+. The key is converted to a String; the value is
# converted to an Integer and then written as that integer's String form.
def put(key, value)
  name = key.to_s
  number = value.to_i
  db[name] = number.to_s
end
|
57
|
+
|
58
|
+
# The :path entry of the dictionary options (handed to ::LMDB.new by #open).
def path
  options[:path]
end
|
61
|
+
|
62
|
+
# The options hash without the :path and :source entries; #open passes the
# result to ::LMDB.new as its options argument.
def lmdb_options
  options.reject { |key, _value| key == :path || key == :source }
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module AnyStyle
|
2
|
+
class Dictionary
|
3
|
+
class Marshal < Dictionary
|
4
|
+
@defaults = {
|
5
|
+
path: File.expand_path('../../data/dict.marshal', __FILE__)
|
6
|
+
}
|
7
|
+
|
8
|
+
# Builds a Marshal-file-backed dictionary.
#
# The class-level defaults are merged with +options+ (entries in +options+
# win on key collisions) and the merged hash is passed to the superclass
# initializer.
def initialize(options = {})
  merged = self.class.defaults.merge(options)
  super(merged)
end
|
11
|
+
|
12
|
+
# Loads the dictionary from the Marshal file at options[:path], or starts
# with an empty Hash when that file does not exist. Returns self.
#
# The +ensure+ clause runs on every exit path: if the dictionary is empty
# it is populated and then dumped to options[:path].
#
# Both file accesses use the block form of File.open in binary mode so the
# handles are closed (and the dump flushed to disk) before this method
# returns. File.exist? replaces File.exists?, which was removed in Ruby 3.2.
#
# NOTE(review): Marshal.load can instantiate arbitrary objects; the path
# should only ever point at a trusted file.
def open
  file = options[:path]

  @db =
    if File.exist?(file)
      File.open(file, 'rb') { |io| ::Marshal.load(io) }
    else
      {}
    end

  self
ensure
  if empty?
    populate!
    File.open(options[:path], 'wb') { |io| ::Marshal.dump(db, io) }
  end
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module AnyStyle
|
2
|
+
require 'redis'
|
3
|
+
maybe_require 'redis/namespace'
|
4
|
+
|
5
|
+
class Dictionary
|
6
|
+
class Redis < Dictionary
|
7
|
+
@defaults = {
|
8
|
+
namespace: 'anystyle',
|
9
|
+
port: 6379
|
10
|
+
}
|
11
|
+
|
12
|
+
# Builds a Redis-backed dictionary.
#
# The class-level defaults are merged with +options+ (entries in +options+
# win on key collisions) and the merged hash is passed to the superclass
# initializer.
def initialize(options = {})
  merged = self.class.defaults.merge(options)
  super(merged)
end
|
15
|
+
|
16
|
+
# Connects to Redis with the dictionary options unless already open, and
# wraps the connection in a Redis::Namespace when a namespace is configured
# and that class is defined. Returns self.
#
# The +ensure+ clause runs on every exit path, so an open but empty
# dictionary is populated whether the body returned early, ran to the end
# or raised.
def open
  return self if open?

  @db = ::Redis.new(options)

  if !namespace.nil? && defined?(::Redis::Namespace)
    @db = ::Redis::Namespace.new namespace, redis: @db
  end

  self
ensure
  populate! if empty?
end
|
29
|
+
|
30
|
+
# Closes the Redis connection.
#
# Does nothing when the dictionary was never opened; without the guard the
# call was sent to nil and raised NoMethodError.
def close
  db.close if open?
end
|
33
|
+
|
34
|
+
# True once a connection has been assigned, false otherwise.
def open?
  !db.nil?
end
|
37
|
+
|
38
|
+
# True only when the dictionary is open and the connection reports a
# dbsize of zero; a dictionary that is not open is never considered empty.
def empty?
  open? && db.dbsize == 0
end
|
41
|
+
|
42
|
+
# Looks up +key+ (converted to a String) and returns the stored value
# converted with #to_i.
def get(key)
  raw = db[key.to_s]
  raw.to_i
end
|
45
|
+
|
46
|
+
# Stores +value+, converted to an Integer, under +key+ converted to a
# String.
def put(key, value)
  name = key.to_s
  number = value.to_i
  db[name] = number
end
|
49
|
+
|
50
|
+
# The :namespace entry of the dictionary options; #open only wraps the
# connection in a Redis::Namespace when this is non-nil.
def namespace
  options[:namespace]
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,264 @@
|
|
1
|
+
module AnyStyle
|
2
|
+
class Document < Wapiti::Sequence
|
3
|
+
class << self
|
4
|
+
include PdfUtils
|
5
|
+
|
6
|
+
# Splits +string+ on +delimiter+ and builds a document with one
# Wapiti::Token per line.
#
# With +tagged+ set, each line is expected to look like "label | text".
# The part before the first "| " becomes the label; when that part is
# empty, the most recent non-empty label is carried over.
#
# An empty line has nothing to split: String#split returns [] for it, which
# left the label nil and raised NoMethodError. Such lines now keep their
# empty value and inherit the current label.
def parse(string, delimiter: /\r?\n/, tagged: false)
  current_label = ''

  tokens = string.split(delimiter).map do |line|
    if tagged && !line.empty?
      label, line = line.split(/\s*\| /, 2)
      current_label = label unless label.empty?
    end

    Wapiti::Token.new line, label: current_label.to_s
  end

  new(tokens)
end
|
16
|
+
|
17
|
+
# Reads the file at +path+ and parses it into a document.
#
# +format+ defaults to the file extension and selects how the file is read:
#   '.pdf' - text extracted with pdf_to_text; metadata and info are read
#            only when opts[:parse_meta] / opts[:parse_info] are set
#   '.ttx' - read as UTF-8 and always parsed as tagged input
#   '.txt' - read as UTF-8
#
# The returned document has +path+ (absolute), +meta+ and +info+ assigned.
#
# Raises ArgumentError when the path is tainted, when the file does not
# exist, or when the format is none of the above. An unknown format used to
# fall through with no input and fail later with NoMethodError on nil.
def open(path, format: File.extname(path), tagged: false, layout: true, **opts)
  # String#tainted? was removed in Ruby 3.2, so only ask when it exists.
  raise ArgumentError,
    "cannot open tainted path: '#{path}'" if path.respond_to?(:tainted?) && path.tainted?
  raise ArgumentError,
    "document not found: '#{path}'" unless File.exist?(path)

  path = File.absolute_path(path)

  case format.downcase
  when '.pdf'
    meta = pdf_meta path if opts[:parse_meta]
    info = pdf_info path if opts[:parse_info]
    input = pdf_to_text path, layout: layout
  when '.ttx'
    tagged = true
    input = File.read(path, encoding: 'utf-8')
  when '.txt'
    input = File.read(path, encoding: 'utf-8')
  else
    raise ArgumentError, "unsupported document format: '#{format}'"
  end

  doc = parse input, tagged: tagged
  doc.path = path
  doc.meta = meta
  doc.info = info
  doc
end
|
43
|
+
end
|
44
|
+
|
45
|
+
include StringUtils
|
46
|
+
|
47
|
+
attr_accessor :meta, :info, :path, :pages, :tokens
|
48
|
+
alias_method :lines, :tokens
|
49
|
+
|
50
|
+
# The document's lines grouped into pages by Page.parse. The result is
# computed on first use and cached in @pages.
def pages
  @pages || (@pages = Page.parse(lines))
end
|
53
|
+
|
54
|
+
# Walks every line of every page, yielding four values per line: the line,
# its index within the page, the page, and the page's index.
#
# Returns self when a block is given, otherwise an enumerator.
def each
  return to_enum unless block_given?

  pages.each_with_index do |page, pn|
    page.lines.each_with_index do |line, ln|
      yield line, ln, page, pn
    end
  end

  self
end
|
66
|
+
|
67
|
+
# Yields the document's sections, each as an array of lines.
#
# A section collects consecutive lines labelled 'ref' or 'text'. A line
# labelled 'title' ends the section collected so far (if any); lines with
# any other label are skipped. A final non-empty section is yielded at the
# end.
#
# Returns self when a block is given. Without a block it returns an
# enumerator over this method; the bare to_enum used before enumerated
# #each instead, because :each is Object#to_enum's default.
def each_section
  return to_enum(:each_section) unless block_given?

  current = []

  lines.each do |ln|
    case ln.label
    when 'title'
      unless current.empty?
        yield current
        current = []
      end
    when 'ref', 'text'
      current << ln
    end
  end

  yield current unless current.empty?
  self
end
|
91
|
+
|
92
|
+
# Returns a copy of this document whose tokens keep this document's line
# values but take their label (as a String) and a copy of their
# observations from the element of +other+ at the same index.
def label(other)
  copy = dup

  copy.tokens = lines.each_with_index.map do |line, idx|
    source = other[idx]
    Wapiti::Token.new line.value,
      label: source.label.to_s,
      observations: source.observations.dup
  end

  copy
end
|
101
|
+
|
102
|
+
# Serializes the document.
#
# With +tagged+ set, every line is written as "label | value", joined with
# +delimiter+. The label is left blank when it repeats the label of the
# preceding line, and the label column is cut off after 14 characters.
#
# Without +tagged+ the call is forwarded to the superclass with
# expanded: false added.
def to_s(delimiter: "\n", encode: false, tagged: false, **opts)
  unless tagged
    return super(delimiter: delimiter, encode: encode, tagged: tagged, expanded: false, **opts)
  end

  previous = nil

  rows = lines.map do |ln|
    shown = (ln.label == previous) ? '' : ln.label
    previous = ln.label
    '%.14s| %s' % ["#{shown} ", ln.value]
  end

  rows.join(delimiter)
end
|
114
|
+
|
115
|
+
# Forwards to the superclass implementation with +encode+ defaulting to
# true; any other keyword options are passed through unchanged.
def to_a(encode: true, **opts)
  forwarded = opts.merge(encode: encode)
  super(**forwarded)
end
|
118
|
+
|
119
|
+
# Summarizes the document as a Hash with the keys :info, :meta, :sections,
# :title and :references. +opts+ is forwarded to #sections, #title and
# #references.
def to_h(**opts)
  summary = {}
  summary[:info] = info
  summary[:meta] = meta
  summary[:sections] = sections(**opts)
  summary[:title] = title(**opts)
  summary[:references] = references(**opts)
  summary
end
|
128
|
+
|
129
|
+
# Collects the document's references as an array of Strings.
#
# Lines labelled 'ref' are cleaned with display_chars, stripped, and either
# joined onto the reference being assembled (when #join_refs? agrees) or
# used to start a new one. For any other label, while a reference is being
# assembled: 'blank' and 'meta' lines (and any line once more than 15 have
# already been counted) only increase the gap counter; every other line
# finishes the reference.
def references(**opts)
  refs = []
  pending = nil   # reference currently being assembled
  gap = 0         # non-ref lines counted since the pending reference started
  base = 0        # leading-whitespace width of the pending reference's first line

  lines.each do |ln|
    case ln.label
    when 'ref'
      text = display_chars(ln.value).rstrip
      lead = text[/^\s*/].length
      text.lstrip!

      if !pending.nil? && join_refs?(pending, text, gap, lead - base)
        pending = join_refs(pending, text)
      else
        refs << pending unless pending.nil?
        pending, gap, base = text, 0, lead
      end
    else
      next if pending.nil?

      if gap > 15 || %w{ blank meta }.include?(ln.label)
        gap += 1
      else
        refs << pending
        # base is reassigned before it is read again, so nil is harmless.
        pending, gap, base = nil, 0, nil
      end
    end
  end

  refs << pending unless pending.nil?
  refs
end
|
167
|
+
|
168
|
+
# Decides whether line +b+ continues reference +a+.
#
# Six indicators in favour of joining and six against are counted; the
# lines are joined when the indicators in favour outnumber those against by
# more than one. +delta+ and +indent+ are supplied by the caller
# (#references passes its gap counter and the difference in leading
# whitespace).
def join_refs?(a, b, delta = 0, indent = 0)
  starts_lowercase = b.match?(/^\p{Ll}/)
  letter_meets_letter = a.match?(/\p{L}$/) && b.match?(/^\p{L}/)

  in_favour = 0
  in_favour += 1 if indent > 0
  in_favour += 1 if delta == 0
  in_favour += 1 if b.length < 50
  in_favour += 1 if a.length < 65
  in_favour += 1 if a.match?(/[,;:&\p{Pd}]$/)
  in_favour += 1 if starts_lowercase || letter_meets_letter

  against = 0
  against += 1 if indent < 0
  against += 1 if delta > 8
  against += 1 if a.match?(/\.\]$/)
  against += 1 if a.length > 500
  against += 1 if (b.length - a.length) > 12
  against += 1 if b.match?(/^(\p{Pd}\p{Pd}|\p{Lu}\p{Ll}+, \p{Lu}\.|\[\d)/)

  (in_favour - against) > 1
end
|
189
|
+
|
190
|
+
# Joins two reference fragments into one String.
#
# When +a+ ends in a hyphen the fragments are concatenated directly, and
# the hyphen is dropped if +b+ starts with a lowercase letter. Otherwise
# the fragments are joined with a single space.
def join_refs(a, b)
  return "#{a} #{b}" unless a[-1] == '-'
  return "#{a[0...-1]}#{b}" if b =~ /^\p{Ll}/

  "#{a}#{b}"
end
|
201
|
+
|
202
|
+
# Always returns an empty Array; +delimiter+ and +opts+ are accepted but
# not used by this implementation.
def sections(delimiter: "\n", **opts)
  []
end
|
205
|
+
|
206
|
+
# The values of the first run of consecutive lines labelled 'title', joined
# with +delimiter+. Returns an empty String when no line carries that label.
def title(delimiter: " ", **opts)
  from_first_title = lines.drop_while { |ln| ln.label != 'title' }
  first_run = from_first_title.take_while { |ln| ln.label == 'title' }

  first_run.map(&:value).join(delimiter)
end
|
213
|
+
|
214
|
+
# Short description of the document showing its +size+.
def inspect
  line_count = size
  "#<AnyStyle::Document lines={#{line_count}}>"
end
|
217
|
+
|
218
|
+
|
219
|
+
class Page
|
220
|
+
extend StringUtils
|
221
|
+
|
222
|
+
class << self
|
223
|
+
# Groups +lines+ into pages.
#
# A line whose value satisfies page_break? starts a new page; all other
# lines are appended to the current page. Each page records the largest
# display_width among its lines. Pages without lines are never created.
def parse(lines)
  result = []
  buffer = []
  widest = 0

  lines.each do |ln|
    span = display_width(ln.value)

    if page_break?(ln.value)
      result << new(buffer, width: widest) unless buffer.empty?
      buffer, widest = [ln], span
    else
      buffer.push(ln)
      widest = [widest, span].max
    end
  end

  result << new(buffer, width: widest) unless buffer.empty?
  result
end
|
246
|
+
end
|
247
|
+
|
248
|
+
attr_accessor :lines, :width
|
249
|
+
|
250
|
+
# Stores the page's lines and its width.
def initialize(lines = [], width: 0)
  @lines, @width = lines, width
end
|
254
|
+
|
255
|
+
# Number of lines on the page, as reported by +lines.size+.
def size
  lines.size
end
|
258
|
+
|
259
|
+
# Short description of the page showing its +size+ and +width+.
def inspect
  line_count = size
  page_width = width
  "#<AnyStyle::Document::Page size={#{line_count}} width={#{page_width}}>"
end
|
262
|
+
end
|
263
|
+
end
|
264
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module AnyStyle
|
2
|
+
class Feature
|
3
|
+
include StringUtils
|
4
|
+
|
5
|
+
attr_reader :precision
|
6
|
+
|
7
|
+
# Stores +precision+ (10 unless given), the factor #ratio multiplies by.
# Any additional keyword options are accepted and ignored here.
def initialize(precision: 10, **opts)
  @precision = precision
end
|
10
|
+
|
11
|
+
# Placeholder that always raises NotImplementedError; presumably meant to
# be overridden by concrete feature classes.
def observe(token, **opts)
  fail NotImplementedError
end
|
14
|
+
|
15
|
+
# Returns the element of +seq+ that follows position +idx+, or nil when
# +idx+ is the last position of an Array.
#
# The body used to read an undefined name (+sequence+) instead of the
# +seq+ parameter, so every call raised NameError.
def next(idx, seq)
  seq[idx + 1]
end
|
18
|
+
|
19
|
+
# Returns the element of +seq+ that precedes position +idx+, or nil for
# the first position (rather than wrapping around to the last element).
def prev(idx, seq)
  return nil if idx == 0

  seq[idx - 1]
end
|
22
|
+
|
23
|
+
# Returns x / y scaled by +precision+ and rounded to the nearest Integer,
# or 0 when +y+ is not positive.
def ratio(x, y)
  return 0 unless y > 0

  scaled = (x.to_f / y) * precision
  scaled.round
end
|
26
|
+
end
|
27
|
+
end
|