anystyle 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +7 -0
  2. data/HISTORY.md +78 -0
  3. data/LICENSE +27 -0
  4. data/README.md +103 -0
  5. data/lib/anystyle.rb +71 -0
  6. data/lib/anystyle/dictionary.rb +132 -0
  7. data/lib/anystyle/dictionary/gdbm.rb +52 -0
  8. data/lib/anystyle/dictionary/lmdb.rb +67 -0
  9. data/lib/anystyle/dictionary/marshal.rb +27 -0
  10. data/lib/anystyle/dictionary/redis.rb +55 -0
  11. data/lib/anystyle/document.rb +264 -0
  12. data/lib/anystyle/errors.rb +14 -0
  13. data/lib/anystyle/feature.rb +27 -0
  14. data/lib/anystyle/feature/affix.rb +43 -0
  15. data/lib/anystyle/feature/brackets.rb +32 -0
  16. data/lib/anystyle/feature/canonical.rb +13 -0
  17. data/lib/anystyle/feature/caps.rb +20 -0
  18. data/lib/anystyle/feature/category.rb +70 -0
  19. data/lib/anystyle/feature/dictionary.rb +16 -0
  20. data/lib/anystyle/feature/indent.rb +16 -0
  21. data/lib/anystyle/feature/keyword.rb +52 -0
  22. data/lib/anystyle/feature/line.rb +39 -0
  23. data/lib/anystyle/feature/locator.rb +18 -0
  24. data/lib/anystyle/feature/number.rb +39 -0
  25. data/lib/anystyle/feature/position.rb +28 -0
  26. data/lib/anystyle/feature/punctuation.rb +22 -0
  27. data/lib/anystyle/feature/quotes.rb +20 -0
  28. data/lib/anystyle/feature/ref.rb +21 -0
  29. data/lib/anystyle/feature/terminal.rb +19 -0
  30. data/lib/anystyle/feature/words.rb +74 -0
  31. data/lib/anystyle/finder.rb +94 -0
  32. data/lib/anystyle/format/bibtex.rb +63 -0
  33. data/lib/anystyle/format/csl.rb +28 -0
  34. data/lib/anystyle/normalizer.rb +65 -0
  35. data/lib/anystyle/normalizer/brackets.rb +13 -0
  36. data/lib/anystyle/normalizer/container.rb +13 -0
  37. data/lib/anystyle/normalizer/date.rb +109 -0
  38. data/lib/anystyle/normalizer/edition.rb +16 -0
  39. data/lib/anystyle/normalizer/journal.rb +14 -0
  40. data/lib/anystyle/normalizer/locale.rb +30 -0
  41. data/lib/anystyle/normalizer/location.rb +24 -0
  42. data/lib/anystyle/normalizer/locator.rb +22 -0
  43. data/lib/anystyle/normalizer/names.rb +88 -0
  44. data/lib/anystyle/normalizer/page.rb +29 -0
  45. data/lib/anystyle/normalizer/publisher.rb +18 -0
  46. data/lib/anystyle/normalizer/pubmed.rb +18 -0
  47. data/lib/anystyle/normalizer/punctuation.rb +23 -0
  48. data/lib/anystyle/normalizer/quotes.rb +14 -0
  49. data/lib/anystyle/normalizer/type.rb +54 -0
  50. data/lib/anystyle/normalizer/volume.rb +26 -0
  51. data/lib/anystyle/parser.rb +199 -0
  52. data/lib/anystyle/support.rb +4 -0
  53. data/lib/anystyle/support/finder.mod +3234 -0
  54. data/lib/anystyle/support/finder.txt +75 -0
  55. data/lib/anystyle/support/parser.mod +15025 -0
  56. data/lib/anystyle/support/parser.txt +75 -0
  57. data/lib/anystyle/utils.rb +70 -0
  58. data/lib/anystyle/version.rb +3 -0
  59. data/res/finder/bb132pr2055.ttx +6803 -0
  60. data/res/finder/bb550sh8053.ttx +18660 -0
  61. data/res/finder/bb599nz4341.ttx +2957 -0
  62. data/res/finder/bb725rt6501.ttx +15276 -0
  63. data/res/finder/bc605xz1554.ttx +18815 -0
  64. data/res/finder/bd040gx5718.ttx +4271 -0
  65. data/res/finder/bd413nt2715.ttx +4956 -0
  66. data/res/finder/bd466fq0394.ttx +6100 -0
  67. data/res/finder/bf668vw2021.ttx +3578 -0
  68. data/res/finder/bg495cx0468.ttx +7267 -0
  69. data/res/finder/bg599vt3743.ttx +6752 -0
  70. data/res/finder/bg608dx2253.ttx +4094 -0
  71. data/res/finder/bh410qk3771.ttx +8785 -0
  72. data/res/finder/bh989ww6442.ttx +17204 -0
  73. data/res/finder/bj581pc8202.ttx +2719 -0
  74. data/res/parser/bad.xml +5199 -0
  75. data/res/parser/core.xml +7924 -0
  76. data/res/parser/gold.xml +2707 -0
  77. data/res/parser/good.xml +34281 -0
  78. data/res/parser/stanford-books.xml +2280 -0
  79. data/res/parser/stanford-diss.xml +726 -0
  80. data/res/parser/stanford-theses.xml +4684 -0
  81. data/res/parser/ugly.xml +33246 -0
  82. metadata +195 -0
@@ -0,0 +1,43 @@
module AnyStyle
  class Feature
    # Emits the character n-grams (n = 1..size) taken from the start
    # (prefix mode) or the end (suffix mode) of a token.
    class Affix < Feature
      attr_reader :size

      # size:   length of the longest affix; also the number of values
      #         observe returns.
      # prefix: read characters from the start of the token (default).
      # suffix: read characters from the end of the token; suffix mode is
      #         also selected by passing `prefix: false`.
      def initialize(size: 4, prefix: true, suffix: false)
        @size = size
        @suffix = suffix || !prefix
      end

      # Returns an Array with `size` strings: the affixes of length
      # 1, 2, ... size. Tokens shorter than `size` repeat the whole token
      # for the remaining slots.
      def observe(token, **opts)
        build(extract(token)) { |slice| join(slice) }
      end

      # Returns up to `size` characters of the token; in suffix mode they
      # are the trailing characters in reverse order.
      def extract(token)
        letters = token.chars
        letters = letters.reverse if suffix?
        letters.take(size)
      end

      # Joins extracted characters back into a string, restoring reading
      # order in suffix mode.
      def join(chars)
        ordered = suffix? ? chars.reverse : chars
        ordered.join('')
      end

      # Yields the first 1, 2, ... size characters and collects the
      # block's results.
      def build(chars)
        (1..size).each_with_object([]) do |length, grams|
          grams << yield(chars.first(length))
        end
      end

      def suffix?
        @suffix ? true : false
      end

      def prefix?
        !suffix?
      end
    end
  end
end
@@ -0,0 +1,32 @@
module AnyStyle
  class Feature
    # Labels a token by the way it uses parentheses, square brackets and
    # angle brackets.
    class Brackets < Feature
      # Ordered [pattern, label] pairs; the first pattern that matches the
      # token decides the label, so fully enclosed forms are listed before
      # the half-open ones.
      PATTERNS = [
        [/^[^\(\[<>\)\]]+$/, :none],
        [/^\(.*\)[,;:\p{Pd}\.]?$/, :parens],
        [/^\[.*\][,;:\p{Pd}\.]?$/, :'square-brackets'],
        [/^<.*>[,;:\p{Pd}\.]?$/, :angle],
        [/\)[,;:\p{Pd}\.]?$/, :'closing-paren'],
        [/^\(/, :'opening-paren'],
        [/\][,;:\p{Pd}\.]?$/, :'closing-square-bracket'],
        [/^\[/, :'opening-square-bracket'],
        [/>[,;:\p{Pd}\.]?$/, :'closing-angle'],
        [/^</, :'opening-angle']
      ].freeze

      # Returns the label of the first matching pattern, or :other when
      # none of them matches (for example a bracket in mid-token).
      def observe(token, **opts)
        _, label = PATTERNS.find { |pattern, _| pattern === token }
        label || :other
      end
    end
  end
end
@@ -0,0 +1,13 @@
module AnyStyle
  class Feature
    # Observes the canonical form of a token's alphabetic content.
    class Canonical < Feature
      # Returns :BLANK when `alpha` is empty; otherwise returns whatever
      # `canonize` (defined outside this class) produces for `alpha`.
      def observe(token, alpha:, **opts)
        return :BLANK if alpha.empty?

        canonize(alpha)
      end
    end
  end
end
@@ -0,0 +1,20 @@
module AnyStyle
  class Feature
    # Describes the capitalisation of a token's alphabetic content.
    class Caps < Feature
      # Returns one of:
      #   :single  - exactly one upper-case letter
      #   :initial - starts with an upper-case letter followed by a lower-case one
      #   :caps    - upper-case letters only
      #   :lower   - lower-case letters only
      #   :other   - anything else, including the empty string
      def observe(_, alpha:, **opts)
        case alpha
        when /^\p{Upper}$/ then :single
        when /^\p{Upper}\p{Lower}/ then :initial
        when /^\p{Upper}+$/ then :caps
        when /^\p{Lower}+$/ then :lower
        else :other
        end
      end
    end
  end
end
@@ -0,0 +1,70 @@
module AnyStyle
  class Feature
    # Reports the Unicode general category of selected characters of a
    # token (by default the first and the last one).
    class Category < Feature
      # Ordered [pattern, label] pairs. Specific sub-categories come before
      # their parent category (e.g. Lu/Ll/Lm before L), so the first hit
      # is the most specific label available in this list.
      CLASSES = [
        [/\p{Lu}/, :Lu],
        [/\p{Ll}/, :Ll],
        [/\p{Lm}/, :Lm],
        [/\p{L}/, :L],
        [/\p{M}/, :M],
        [/\p{N}/, :N],
        [/\p{Pc}/, :Pc],
        [/\p{Pd}/, :Pd],
        [/\p{Ps}/, :Ps],
        [/\p{Pe}/, :Pe],
        [/\p{Pi}/, :Pi],
        [/\p{Pf}/, :Pf],
        [/\p{P}/, :P],
        [/\p{S}/, :S],
        [/\p{Zl}/, :Zl],
        [/\p{Zp}/, :Zp],
        [/\p{Z}/, :Z],
        [/\p{C}/, :C]
      ].freeze

      attr_reader :index

      # index: character positions to inspect, as accepted by
      #        Array#values_at (negative values count from the end).
      # strip: when truthy, surrounding whitespace is removed from the
      #        token before the positions are looked up.
      def initialize(index: [0, -1], strip: false)
        @index = index
        @strip = strip ? true : false
      end

      # Returns one category label per entry in `index`; positions outside
      # the token yield :none.
      def observe(token, **opts)
        picked = chars(token).values_at(*index)
        picked.map { |char| categorize(char) }
      end

      # Returns the token's characters, stripped first if so configured.
      def chars(token)
        (strip? ? token.strip : token).chars
      end

      # Maps a single character (or nil) to its category label.
      def categorize(char)
        hit = CLASSES.find { |pattern, _| pattern === char }
        hit ? hit.last : :none
      end

      def strip?
        @strip
      end
    end
  end
end
@@ -0,0 +1,16 @@
module AnyStyle
  class Feature
    # Looks up a token's alphabetic content in a dictionary object.
    class Dictionary < Feature
      attr_reader :dictionary

      # dictionary: object responding to `tags(key)`.
      # Remaining keyword options are forwarded to the parent initializer.
      def initialize(dictionary:, **opts)
        super(**opts)
        @dictionary = dictionary
      end

      # Returns whatever the dictionary's `tags` method yields for the
      # down-cased `alpha` string.
      def observe(token, alpha:, **opts)
        key = alpha.downcase
        dictionary.tags(key)
      end
    end
  end
end
@@ -0,0 +1,16 @@
module AnyStyle
  class Feature
    # Compares the indentation of a line with that of the preceding one.
    # Relies on the `indent` and `prev` helpers, which are defined outside
    # this class.
    class Indent < Feature
      # Returns a two-element Array:
      #   [0] 'T' when the token's indent is greater than zero, else 'F'
      #   [1] '-', '+' or '=' when the indent is smaller than, greater
      #       than or equal to that of the previous element (an absent
      #       previous element counts as indent 0).
      def observe(token, seq:, idx:, **opts)
        current = indent(token)
        before = prev(idx, seq)
        previous = before.nil? ? 0 : indent(before.value)

        indented = current > 0 ? 'T' : 'F'

        change =
          if current < previous
            '-'
          elsif current > previous
            '+'
          else
            '='
          end

        [indented, change]
      end
    end
  end
end
@@ -0,0 +1,52 @@
module AnyStyle
  class Feature
    # Tags a token with the bibliographic keyword class it belongs to
    # (editor, translator, journal, page, date, ...), or :none.
    class Keyword < Feature
      # token: the raw token; only compared against a literal '&'.
      # alpha: the string matched against the keyword patterns; defaults
      #        to the token itself.
      #
      # Returns one of :and, :editor, :translator, :thesis, :proceedings,
      # :journal, :in, :etal, :page, :volume, :edition, :date, :locator,
      # :accessed or :none. Branch order matters: the first match wins.
      def observe(token, alpha: token, **opts)
        return :and if token == '&'

        case alpha
        when /^ed(s|itors?|ited?|iteurs?)?$/i,
          /^(hg|hrsg|herausgeber)$/i,
          /^(compilador)$/i
          :editor
        when /^trans(l(ated|ators?|ation))?$/i,
          /^übers(etz(t|ung))?$/i,
          /^trad(uction|ucteurs?|uit)?$/i
          :translator
        when /^(dissertation|thesis)$/i
          :thesis
        when /^(proceedings|conference|meeting|transactions|communications|seminar|symposi(on|um))/i
          :proceedings
        when /^(Journal|Zeitschrift|Quarterly|Magazine?|Times|Rev(iew|vue)?|Bulletin|News|Week)/
          :journal
        when /^in$/i
          :in
        when /^([AaUu]nd|y|e)$/
          :and
        when /^(etal|others)$/
          :etal
        when /^(pp?|pages?|S(eiten?)?|ff?)$/
          :page
        when /^(vol(ume)?s?|iss(ue)?|n[or]?|number)$/i
          :volume
        when /^(edn|edition|expanded|rev(ised)?|p?reprint(ed)?|illustrated)$/i,
          /^(aufl(age)?|\p{Alpha}*ausg(abe)?)$/i
          :edition
        # All three date patterns are case-insensitive. The January-June
        # pattern previously lacked the /i flag, so capitalised names such
        # as "January" or "Mar" were missed while "July" or "Dec" matched.
        when /^(nd|date|spring|s[uo]mmer|autumn|fall|winter|frühling|herbst)$/i,
          /^(jan(uary?)?|feb(ruary?)?|mar(ch|z)?|apr(il)?|ma[yi]|jun[ei]?)$/i,
          /^(jul[yi]?|aug(ust)?|sep(tember)?|o[ck]t(ober)?|nov(ember)?|de[cz](ember)?)$/i
          :date
        when /^(pmid|pmcid|arxiv|doi|url)/i
          :locator
        when /^(retrieved|accessed)$/i
          :accessed
        else
          :none
        end
      end
    end
  end
end
@@ -0,0 +1,39 @@
module AnyStyle
  class Feature
    # Collects character statistics for one line of text. Relies on the
    # `display_chars`, `count` and `ratio` helpers, which are defined
    # outside this class.
    class Line < Feature
      # token: the line to analyse.
      # page:  object responding to `width`.
      #
      # Returns an eight-element Array:
      #   letter count, line width (length after right-stripping),
      #   ratio(upper-case letters, letters), ratio(letters, width),
      #   ratio(whitespace, width), ratio(punctuation, width),
      #   ratio(width, page.width), and the label from #classify.
      def observe(token, page:, **opts)
        text = display_chars(token).rstrip

        letters = count(text, /\p{L}/)
        capitals = count(text, /\p{Lu}/)
        marks = count(text, /[\p{Pd}:.,&\(\)"'”„’‚´«「『‘“`»」』]/)
        blanks = count(text, /\s/)
        width = text.length

        [
          letters,
          width,
          ratio(capitals, letters),
          ratio(letters, width),
          ratio(blanks, width),
          ratio(marks, width),
          ratio(width, page.width),
          classify(text)
        ]
      end

      # Returns:
      #   :toc  - the text contains four dots (optionally space separated)
      #           or two or more horizontal-ellipsis characters
      #   :num  - the text ends in three whitespace characters and digits
      #   :cap  - the text starts with "Table", "Figure" or "Fig."
      #   :none - otherwise
      def classify(chars)
        case chars
        when /\.\s*\.\s*\.\s*\.|……+/ then :toc
        when /\s\s\s\d+$/ then :num
        when /^\s*(Table|Fig(ure|\.))/ then :cap
        else :none
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'uri'
2
+
module AnyStyle
  class Feature
    # Flags tokens that look like a locator: an identifier keyword
    # (DOI, ISBN, URL, PMID, ...), a DOI, or a URI.
    class Locator < Feature
      # Identifier keywords, matched as whole words.
      KEYWORD_PATTERN = /\b(DOI|doi|ISBN|Url|URL|PMCID|PMID|PMC\d+|PubMed)\b/

      # DOI: the literal prefix "10." plus a 4-9 digit registrant code,
      # a slash and a suffix. The dot is escaped so that plain digit runs
      # such as "1001234/56" are not mistaken for a DOI.
      DOI_PATTERN = /10\.\d{4,9}\/[-._;()\/:A-Z0-9]+/i

      # RFC 2396 URI pattern, built once at load time. This is the same
      # expression the obsolete URI.regexp helper used to return, without
      # rebuilding it on every call.
      URI_PATTERN = URI::RFC2396_Parser.new.make_regexp.freeze

      # Returns 'T' when the token matches any locator pattern, else 'F'.
      def observe(token, **opts)
        case token
        when KEYWORD_PATTERN, DOI_PATTERN, URI_PATTERN
          'T'
        else
          'F'
        end
      end
    end
  end
end
@@ -0,0 +1,39 @@
module AnyStyle
  class Feature
    # Labels a token by the kind of number it contains, if any.
    class Number < Feature
      # Ordered [pattern, label] pairs; the first pattern that matches the
      # token decides the label, so the order below is significant
      # (e.g. :year is tested before :quad, :isbn before :all).
      PATTERNS = [
        [/\d[\(:;]\d/, :volume],
        [/^97[89](\p{Pd}?\d){10}$/, :isbn],
        [/^\d(\p{Pd}?\d){9}$/, :isbn],
        [/\b(1\d|20)\d\d\b/, :year],
        [/^\d\d\d\d$/, :quad],
        [/^\d\d\d$/, :triple],
        [/^\d\d$/, :double],
        [/^\d$/, :single],
        [/^\d+$/, :all],
        [/^\d+\p{Pd}+\d+$/, :range],
        [/^\p{Lu}[\p{Lu}\p{Pd}\/]+\d+[,.:]?$/, :idnum],
        [/\d\p{Alpha}{1,3}\b/i, :ordinal],
        [/\d/, :numeric],
        [/^([IVXLDCM]+|[ivx]+)\b/, :roman]
      ].freeze

      # Returns the label of the first matching pattern, or :none when the
      # token matches none of them.
      def observe(token, **opts)
        _, label = PATTERNS.find { |pattern, _| pattern === token }
        label || :none
      end
    end
  end
end
@@ -0,0 +1,28 @@
module AnyStyle
  class Feature
    # Describes where an element sits within its sequence. Relies on the
    # `ratio` helper, which is defined outside this class.
    class Position < Feature
      attr_reader :idx, :seq

      # idx: name of the observe option holding the element's index.
      # seq: name of the observe option holding the sequence.
      # Remaining keyword options are forwarded to the parent initializer
      # as keywords. Passing the hash positionally (`super(opts)`) raises
      # ArgumentError on Ruby 3 when the parent accepts keywords only.
      def initialize(idx: :idx, seq: :seq, **opts)
        super(**opts)
        @idx, @seq = idx, seq
      end

      # Returns :only, :first or :last for the boundary positions;
      # otherwise returns ratio(index, sequence size).
      def observe(token, **opts)
        i = opts[idx]
        n = opts[seq].size

        first = i == 0
        last = i == n - 1

        if first && last
          :only
        elsif first
          :first
        elsif last
          :last
        else
          ratio i, n
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
module AnyStyle
  class Feature
    # Labels a token by the most significant punctuation it contains.
    class Punctuation < Feature
      # Returns :none for a non-empty token without any punctuation
      # character. Otherwise the first applicable label in this order:
      # :colon, :hyphen (any dash), :period, :amp. Everything else,
      # including the empty string, is :other.
      def observe(token, **opts)
        case token
        when /^\p{^P}+$/ then :none
        when /:/ then :colon
        when /\p{Pd}/ then :hyphen
        when /\./ then :period
        when /&/ then :amp
        else :other
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
module AnyStyle
  class Feature
    # Labels a token by the way it uses quotation marks.
    class Quotes < Feature
      # Ordered [pattern, label] pairs; the first pattern that matches the
      # token decides the label. A closing quote may be followed by one
      # trailing punctuation character.
      PATTERNS = [
        [/^[^"'”„’‚´«「『‘“`»」』]+$/, :none],
        [/^["'”„’‚´«「『‘“`»].*["'”„’‚´«‘“`»」』][,;:\p{Pd}!\?\.]?$/, :'quote-unquote'],
        [/^["'”„’‚´«「『‘“`»]/, :quote],
        [/["'”„’‚´«‘“`»」』][,;:\p{Pd}!\?\.]?$/, :unquote]
      ].freeze

      # Returns :none (no quote characters), :'quote-unquote' (quoted at
      # both ends), :quote (starts with a quote), :unquote (ends with a
      # quote) or :other (quote characters only elsewhere, or the empty
      # string).
      def observe(token, **opts)
        _, label = PATTERNS.find { |pattern, _| pattern === token }
        label || :other
      end
    end
  end
end