anystyle 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/HISTORY.md +78 -0
- data/LICENSE +27 -0
- data/README.md +103 -0
- data/lib/anystyle.rb +71 -0
- data/lib/anystyle/dictionary.rb +132 -0
- data/lib/anystyle/dictionary/gdbm.rb +52 -0
- data/lib/anystyle/dictionary/lmdb.rb +67 -0
- data/lib/anystyle/dictionary/marshal.rb +27 -0
- data/lib/anystyle/dictionary/redis.rb +55 -0
- data/lib/anystyle/document.rb +264 -0
- data/lib/anystyle/errors.rb +14 -0
- data/lib/anystyle/feature.rb +27 -0
- data/lib/anystyle/feature/affix.rb +43 -0
- data/lib/anystyle/feature/brackets.rb +32 -0
- data/lib/anystyle/feature/canonical.rb +13 -0
- data/lib/anystyle/feature/caps.rb +20 -0
- data/lib/anystyle/feature/category.rb +70 -0
- data/lib/anystyle/feature/dictionary.rb +16 -0
- data/lib/anystyle/feature/indent.rb +16 -0
- data/lib/anystyle/feature/keyword.rb +52 -0
- data/lib/anystyle/feature/line.rb +39 -0
- data/lib/anystyle/feature/locator.rb +18 -0
- data/lib/anystyle/feature/number.rb +39 -0
- data/lib/anystyle/feature/position.rb +28 -0
- data/lib/anystyle/feature/punctuation.rb +22 -0
- data/lib/anystyle/feature/quotes.rb +20 -0
- data/lib/anystyle/feature/ref.rb +21 -0
- data/lib/anystyle/feature/terminal.rb +19 -0
- data/lib/anystyle/feature/words.rb +74 -0
- data/lib/anystyle/finder.rb +94 -0
- data/lib/anystyle/format/bibtex.rb +63 -0
- data/lib/anystyle/format/csl.rb +28 -0
- data/lib/anystyle/normalizer.rb +65 -0
- data/lib/anystyle/normalizer/brackets.rb +13 -0
- data/lib/anystyle/normalizer/container.rb +13 -0
- data/lib/anystyle/normalizer/date.rb +109 -0
- data/lib/anystyle/normalizer/edition.rb +16 -0
- data/lib/anystyle/normalizer/journal.rb +14 -0
- data/lib/anystyle/normalizer/locale.rb +30 -0
- data/lib/anystyle/normalizer/location.rb +24 -0
- data/lib/anystyle/normalizer/locator.rb +22 -0
- data/lib/anystyle/normalizer/names.rb +88 -0
- data/lib/anystyle/normalizer/page.rb +29 -0
- data/lib/anystyle/normalizer/publisher.rb +18 -0
- data/lib/anystyle/normalizer/pubmed.rb +18 -0
- data/lib/anystyle/normalizer/punctuation.rb +23 -0
- data/lib/anystyle/normalizer/quotes.rb +14 -0
- data/lib/anystyle/normalizer/type.rb +54 -0
- data/lib/anystyle/normalizer/volume.rb +26 -0
- data/lib/anystyle/parser.rb +199 -0
- data/lib/anystyle/support.rb +4 -0
- data/lib/anystyle/support/finder.mod +3234 -0
- data/lib/anystyle/support/finder.txt +75 -0
- data/lib/anystyle/support/parser.mod +15025 -0
- data/lib/anystyle/support/parser.txt +75 -0
- data/lib/anystyle/utils.rb +70 -0
- data/lib/anystyle/version.rb +3 -0
- data/res/finder/bb132pr2055.ttx +6803 -0
- data/res/finder/bb550sh8053.ttx +18660 -0
- data/res/finder/bb599nz4341.ttx +2957 -0
- data/res/finder/bb725rt6501.ttx +15276 -0
- data/res/finder/bc605xz1554.ttx +18815 -0
- data/res/finder/bd040gx5718.ttx +4271 -0
- data/res/finder/bd413nt2715.ttx +4956 -0
- data/res/finder/bd466fq0394.ttx +6100 -0
- data/res/finder/bf668vw2021.ttx +3578 -0
- data/res/finder/bg495cx0468.ttx +7267 -0
- data/res/finder/bg599vt3743.ttx +6752 -0
- data/res/finder/bg608dx2253.ttx +4094 -0
- data/res/finder/bh410qk3771.ttx +8785 -0
- data/res/finder/bh989ww6442.ttx +17204 -0
- data/res/finder/bj581pc8202.ttx +2719 -0
- data/res/parser/bad.xml +5199 -0
- data/res/parser/core.xml +7924 -0
- data/res/parser/gold.xml +2707 -0
- data/res/parser/good.xml +34281 -0
- data/res/parser/stanford-books.xml +2280 -0
- data/res/parser/stanford-diss.xml +726 -0
- data/res/parser/stanford-theses.xml +4684 -0
- data/res/parser/ugly.xml +33246 -0
- metadata +195 -0
@@ -0,0 +1,43 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Emits the leading (prefix mode) or trailing (suffix mode) character
    # n-grams of a token for n = 1..size.
    class Affix < Feature
      attr_reader :size

      # size:   number of affixes to emit (and the longest affix length).
      # prefix: pass false to switch to suffix mode.
      # suffix: pass true to switch to suffix mode; wins over +prefix+.
      def initialize(size: 4, prefix: true, suffix: false)
        @size = size
        @suffix = suffix || !prefix
      end

      # Returns an array with +size+ strings: the affixes of length 1..size.
      # A token shorter than +size+ repeats its longest affix.
      def observe(token, **opts)
        window = extract(token)
        build(window) { |slice| join(slice) }
      end

      # Returns up to +size+ characters taken from the relevant end of the
      # token. In suffix mode they are ordered last character first.
      def extract(token)
        letters = token.chars
        letters = letters.reverse if suffix?
        letters.take(size)
      end

      # Joins a slice of #extract back into a string in reading order.
      def join(chars)
        ordered = suffix? ? chars.reverse : chars
        ordered.join('')
      end

      # Yields the first 1, 2, ..., size elements of +chars+ and collects
      # the block results.
      def build(chars)
        1.upto(size).map { |length| yield chars.take(length) }
      end

      def suffix?
        @suffix ? true : false
      end

      def prefix?
        !suffix?
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Classifies how a token is enclosed, opened or closed by parentheses,
    # square brackets or angle brackets.
    class Brackets < Feature
      # Ordered [pattern, label] pairs; the first matching pattern wins.
      # Closing patterns tolerate one trailing punctuation character.
      RULES = [
        [/^[^\(\[<>\)\]]+$/, :none],
        [/^\(.*\)[,;:\p{Pd}\.]?$/, :parens],
        [/^\[.*\][,;:\p{Pd}\.]?$/, :'square-brackets'],
        [/^<.*>[,;:\p{Pd}\.]?$/, :angle],
        [/\)[,;:\p{Pd}\.]?$/, :'closing-paren'],
        [/^\(/, :'opening-paren'],
        [/\][,;:\p{Pd}\.]?$/, :'closing-square-bracket'],
        [/^\[/, :'opening-square-bracket'],
        [/>[,;:\p{Pd}\.]?$/, :'closing-angle'],
        [/^</, :'opening-angle']
      ].freeze

      # Returns the label of the first rule matching +token+, or :other
      # when no rule matches (e.g. an empty token or a bracket in the
      # middle of the token only).
      def observe(token, **opts)
        rule = RULES.find { |pattern, _label| pattern === token }
        rule ? rule.last : :other
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Describes the capitalization shape of a token's +alpha+ form.
    class Caps < Feature
      # Ordered [pattern, label] pairs; the first matching pattern wins.
      SHAPES = [
        [/^\p{Upper}$/, :single],
        [/^\p{Upper}\p{Lower}/, :initial],
        [/^\p{Upper}+$/, :caps],
        [/^\p{Lower}+$/, :lower]
      ].freeze

      # The token itself is ignored; only the +alpha+ keyword is inspected.
      # Returns :single, :initial, :caps, :lower or :other.
      def observe(_, alpha:, **opts)
        shape = SHAPES.find { |pattern, _label| pattern === alpha }
        shape ? shape.last : :other
      end
    end
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Reports the Unicode general category of selected characters of a
    # token (by default the first and the last one).
    class Category < Feature
      # Ordered [pattern, label] pairs; specific sub-categories come before
      # their parent category so that the first match is the most precise.
      CATEGORIES = [
        [/\p{Lu}/, :Lu],
        [/\p{Ll}/, :Ll],
        [/\p{Lm}/, :Lm],
        [/\p{L}/, :L],
        [/\p{M}/, :M],
        [/\p{N}/, :N],
        [/\p{Pc}/, :Pc],
        [/\p{Pd}/, :Pd],
        [/\p{Ps}/, :Ps],
        [/\p{Pe}/, :Pe],
        [/\p{Pi}/, :Pi],
        [/\p{Pf}/, :Pf],
        [/\p{P}/, :P],
        [/\p{S}/, :S],
        [/\p{Zl}/, :Zl],
        [/\p{Zp}/, :Zp],
        [/\p{Z}/, :Z],
        [/\p{C}/, :C]
      ].freeze

      attr_reader :index

      # index: character positions to inspect (negative values count from
      #        the end of the token).
      # strip: when truthy, surrounding whitespace is removed first.
      def initialize(index: [0, -1], strip: false)
        @index = index
        @strip = strip ? true : false
      end

      # Returns one category symbol per entry in +index+; positions that
      # do not exist in the token yield :none.
      def observe(token, **opts)
        selected = chars(token).values_at(*index)
        selected.map { |character| categorize(character) }
      end

      # The token's characters, stripped first if so configured.
      def chars(token)
        source = strip? ? token.strip : token
        source.chars
      end

      # Maps a single character to its category symbol, or :none when
      # nothing matches (including a nil character).
      def categorize(char)
        match = CATEGORIES.find { |pattern, _label| pattern === char }
        match ? match.last : :none
      end

      def strip?
        @strip
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Looks a token up in a dictionary object and reports the tags the
    # dictionary returns for it.
    class Dictionary < Feature
      attr_reader :dictionary

      # dictionary: any object responding to #tags(string).
      # Remaining keyword options are forwarded to the parent initializer.
      def initialize(dictionary:, **opts)
        super(**opts)
        @dictionary = dictionary
      end

      # Returns whatever the dictionary's #tags yields for the downcased
      # +alpha+ form; the raw token is not consulted.
      def observe(token, alpha:, **opts)
        lookup_key = alpha.downcase
        dictionary.tags(lookup_key)
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Reports whether a line is indented and how its indentation compares
    # with the preceding element of the sequence.
    class Indent < Feature
      # Returns a pair of strings:
      #   * 'T' if the token's indent is greater than zero, otherwise 'F';
      #   * '-', '+' or '=' when the indent is smaller than, greater than
      #     or equal to the previous element's indent (0 if there is none).
      # Relies on #indent and #prev, which are defined outside this class.
      def observe(token, seq:, idx:, **opts)
        current = indent(token)
        before = prev(idx, seq)
        previous = before.nil? ? 0 : indent(before.value)

        indented = current > 0 ? 'T' : 'F'
        change =
          if current < previous
            '-'
          elsif current > previous
            '+'
          else
            '='
          end

        [indented, change]
      end
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Flags tokens that are well-known bibliographic keywords (editor,
    # translator, page, volume, date words, ...).
    class Keyword < Feature
      # token: the raw token; only compared against the literal '&'.
      # alpha: the form matched against the keyword patterns; defaults to
      #        the token itself.
      # Returns the keyword's symbol, or :none if nothing matches.
      def observe(token, alpha: token, **opts)
        case token
        when '&'
          :and
        else
          case alpha
          when /^ed(s|itors?|ited?|iteurs?)?$/i,
            /^(hg|hrsg|herausgeber)$/i,
            /^(compilador)$/i
            :editor
          when /^trans(l(ated|ators?|ation))?$/i,
            /^übers(etz(t|ung))?$/i,
            /^trad(uction|ucteurs?|uit)?$/i
            :translator
          when /^(dissertation|thesis)$/i
            :thesis
          when /^(proceedings|conference|meeting|transactions|communications|seminar|symposi(on|um))/i
            :proceedings
          when /^(Journal|Zeitschrift|Quarterly|Magazine?|Times|Rev(iew|vue)?|Bulletin|News|Week)/
            :journal
          when /^in$/i
            :in
          when /^([AaUu]nd|y|e)$/
            :and
          when /^(etal|others)$/
            :etal
          when /^(pp?|pages?|S(eiten?)?|ff?)$/
            :page
          when /^(vol(ume)?s?|iss(ue)?|n[or]?|number)$/i
            :volume
          when /^(edn|edition|expanded|rev(ised)?|p?reprint(ed)?|illustrated)$/i,
            /^(aufl(age)?|\p{Alpha}*ausg(abe)?)$/i
            :edition
          # All three date patterns are case-insensitive. The jan-jun
          # pattern used to lack the /i flag, so capitalized month names
          # such as "January" fell through to :none while "July" matched.
          when /^(nd|date|spring|s[uo]mmer|autumn|fall|winter|frühling|herbst)$/i,
            /^(jan(uary?)?|feb(ruary?)?|mar(ch|z)?|apr(il)?|ma[yi]|jun[ei]?)$/i,
            /^(jul[yi]?|aug(ust)?|sep(tember)?|o[ck]t(ober)?|nov(ember)?|de[cz](ember)?)$/i
            :date
          when /^(pmid|pmcid|arxiv|doi|url)/i
            :locator
          when /^(retrieved|accessed)$/i
            :accessed
          else
            :none
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Computes character statistics for a line of text together with a
    # coarse classification of the line.
    class Line < Feature
      # Ordered [pattern, label] pairs used by #classify; first match wins.
      KINDS = [
        [/\.\s*\.\s*\.\s*\.|……+/, :toc],
        [/\s\s\s\d+$/, :num],
        [/^\s*(Table|Fig(ure|\.))/, :cap]
      ].freeze

      # token: the line's text; page: an object responding to #width.
      # Returns, in order: letter count, width, then the ratios
      # upper/letters, letters/width, whitespace/width, punctuation/width
      # and width/page.width, followed by the #classify label.
      # Relies on #display_chars, #count and #ratio, which are defined
      # outside this class.
      def observe(token, page:, **opts)
        text = display_chars(token).rstrip

        letters = count(text, /\p{L}/)
        capitals = count(text, /\p{Lu}/)
        marks = count(text, /[\p{Pd}:.,&\(\)"'”„’‚´«「『‘“`»」』]/)
        blanks = count(text, /\s/)
        width = text.length

        stats = [letters, width]
        stats << ratio(capitals, letters)
        stats << ratio(letters, width)
        stats << ratio(blanks, width)
        stats << ratio(marks, width)
        stats << ratio(width, page.width)
        stats << classify(text)
        stats
      end

      # Returns :toc for dot leaders, :num for a number following three
      # whitespace characters at the end, :cap for a leading "Table",
      # "Figure" or "Fig.", and :none otherwise.
      def classify(chars)
        kind = KINDS.find { |pattern, _label| pattern === chars }
        kind ? kind.last : :none
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
3
|
+
module AnyStyle
  class Feature
    # Flags tokens that look like a locator: an identifier label, a DOI
    # or a URI.
    class Locator < Feature
      # Labels that commonly introduce an identifier.
      LABEL_PATTERN = /\b(DOI|doi|ISBN|Url|URL|PMCID|PMID|PMC\d+|PubMed)\b/

      # DOI: the literal prefix "10." followed by a 4-9 digit registrant
      # code, a slash and a suffix. The dot is escaped so that only a real
      # "10." prefix matches (previously any character was accepted).
      DOI_PATTERN = /10\.\d{4,9}\/[-._;()\/:A-Z0-9]+/i

      # Built once at load time rather than on every #observe call.
      # Requires 'uri' to be loaded before this class body is evaluated.
      URI_PATTERN = URI.regexp

      # Returns 'T' if the token matches any locator pattern, else 'F'.
      def observe(token, **opts)
        case token
        when LABEL_PATTERN, DOI_PATTERN, URI_PATTERN
          'T'
        else
          'F'
        end
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Classifies the numeric shape of a token (year, ISBN, range, ...).
    class Number < Feature
      # Ordered [pattern, label] pairs; the first matching pattern wins,
      # so the order encodes precedence (e.g. :isbn before :all).
      SHAPES = [
        [/\d[\(:;]\d/, :volume],
        [/^97[89](\p{Pd}?\d){10}$/, :isbn],
        [/^\d(\p{Pd}?\d){9}$/, :isbn],
        [/\b(1\d|20)\d\d\b/, :year],
        [/^\d\d\d\d$/, :quad],
        [/^\d\d\d$/, :triple],
        [/^\d\d$/, :double],
        [/^\d$/, :single],
        [/^\d+$/, :all],
        [/^\d+\p{Pd}+\d+$/, :range],
        [/^\p{Lu}[\p{Lu}\p{Pd}\/]+\d+[,.:]?$/, :idnum],
        [/\d\p{Alpha}{1,3}\b/i, :ordinal],
        [/\d/, :numeric],
        [/^([IVXLDCM]+|[ivx]+)\b/, :roman]
      ].freeze

      # Returns the label of the first matching shape, or :none.
      def observe(token, **opts)
        shape = SHAPES.find { |pattern, _label| pattern === token }
        shape ? shape.last : :none
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Reports where a token sits inside its sequence.
    class Position < Feature
      attr_reader :idx, :seq

      # idx: name of the observe option holding the token's index.
      # seq: name of the observe option holding the sequence (anything
      #      responding to #size).
      # Remaining options are forwarded to the parent initializer as
      # keywords. Passing the hash positionally (`super(opts)`) raises
      # ArgumentError on Ruby 3 when the parent takes no positional
      # arguments.
      def initialize(idx: :idx, seq: :seq, **opts)
        super(**opts)
        @idx, @seq = idx, seq
      end

      # Returns :only, :first or :last for the respective boundary
      # positions; otherwise the result of ratio(index, size), a helper
      # defined outside this class.
      def observe(token, **opts)
        i = opts[idx]
        n = opts[seq].size

        case
        when i == 0 && i == n - 1
          :only
        when i == 0
          :first
        when i == n - 1
          :last
        else
          ratio i, n
        end
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Reports which kind of punctuation, if any, a token contains.
    class Punctuation < Feature
      # Ordered [pattern, label] pairs; the first matching pattern wins.
      MARKS = [
        [/^\p{^P}+$/, :none],
        [/:/, :colon],
        [/\p{Pd}/, :hyphen],
        [/\./, :period],
        [/&/, :amp]
      ].freeze

      # Returns :none for a non-empty token without punctuation
      # characters, the label of the first matching mark otherwise, and
      # :other when nothing matches.
      def observe(token, **opts)
        mark = MARKS.find { |pattern, _label| pattern === token }
        mark ? mark.last : :other
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Reports whether a token starts and/or ends with a quotation mark.
    class Quotes < Feature
      # Ordered [pattern, label] pairs; the first matching pattern wins.
      # Closing patterns tolerate one trailing punctuation character.
      RULES = [
        [/^[^"'”„’‚´«「『‘“`»」』]+$/, :none],
        [/^["'”„’‚´«「『‘“`»].*["'”„’‚´«‘“`»」』][,;:\p{Pd}!\?\.]?$/, :'quote-unquote'],
        [/^["'”„’‚´«「『‘“`»]/, :quote],
        [/["'”„’‚´«‘“`»」』][,;:\p{Pd}!\?\.]?$/, :unquote]
      ].freeze

      # Returns :none for a non-empty token without quotation marks,
      # :'quote-unquote', :quote or :unquote depending on where the marks
      # sit, and :other when nothing matches.
      def observe(token, **opts)
        rule = RULES.find { |pattern, _label| pattern === token }
        rule ? rule.last : :other
      end
    end
  end
end
|