anystyle 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/HISTORY.md +78 -0
  3. data/LICENSE +27 -0
  4. data/README.md +103 -0
  5. data/lib/anystyle.rb +71 -0
  6. data/lib/anystyle/dictionary.rb +132 -0
  7. data/lib/anystyle/dictionary/gdbm.rb +52 -0
  8. data/lib/anystyle/dictionary/lmdb.rb +67 -0
  9. data/lib/anystyle/dictionary/marshal.rb +27 -0
  10. data/lib/anystyle/dictionary/redis.rb +55 -0
  11. data/lib/anystyle/document.rb +264 -0
  12. data/lib/anystyle/errors.rb +14 -0
  13. data/lib/anystyle/feature.rb +27 -0
  14. data/lib/anystyle/feature/affix.rb +43 -0
  15. data/lib/anystyle/feature/brackets.rb +32 -0
  16. data/lib/anystyle/feature/canonical.rb +13 -0
  17. data/lib/anystyle/feature/caps.rb +20 -0
  18. data/lib/anystyle/feature/category.rb +70 -0
  19. data/lib/anystyle/feature/dictionary.rb +16 -0
  20. data/lib/anystyle/feature/indent.rb +16 -0
  21. data/lib/anystyle/feature/keyword.rb +52 -0
  22. data/lib/anystyle/feature/line.rb +39 -0
  23. data/lib/anystyle/feature/locator.rb +18 -0
  24. data/lib/anystyle/feature/number.rb +39 -0
  25. data/lib/anystyle/feature/position.rb +28 -0
  26. data/lib/anystyle/feature/punctuation.rb +22 -0
  27. data/lib/anystyle/feature/quotes.rb +20 -0
  28. data/lib/anystyle/feature/ref.rb +21 -0
  29. data/lib/anystyle/feature/terminal.rb +19 -0
  30. data/lib/anystyle/feature/words.rb +74 -0
  31. data/lib/anystyle/finder.rb +94 -0
  32. data/lib/anystyle/format/bibtex.rb +63 -0
  33. data/lib/anystyle/format/csl.rb +28 -0
  34. data/lib/anystyle/normalizer.rb +65 -0
  35. data/lib/anystyle/normalizer/brackets.rb +13 -0
  36. data/lib/anystyle/normalizer/container.rb +13 -0
  37. data/lib/anystyle/normalizer/date.rb +109 -0
  38. data/lib/anystyle/normalizer/edition.rb +16 -0
  39. data/lib/anystyle/normalizer/journal.rb +14 -0
  40. data/lib/anystyle/normalizer/locale.rb +30 -0
  41. data/lib/anystyle/normalizer/location.rb +24 -0
  42. data/lib/anystyle/normalizer/locator.rb +22 -0
  43. data/lib/anystyle/normalizer/names.rb +88 -0
  44. data/lib/anystyle/normalizer/page.rb +29 -0
  45. data/lib/anystyle/normalizer/publisher.rb +18 -0
  46. data/lib/anystyle/normalizer/pubmed.rb +18 -0
  47. data/lib/anystyle/normalizer/punctuation.rb +23 -0
  48. data/lib/anystyle/normalizer/quotes.rb +14 -0
  49. data/lib/anystyle/normalizer/type.rb +54 -0
  50. data/lib/anystyle/normalizer/volume.rb +26 -0
  51. data/lib/anystyle/parser.rb +199 -0
  52. data/lib/anystyle/support.rb +4 -0
  53. data/lib/anystyle/support/finder.mod +3234 -0
  54. data/lib/anystyle/support/finder.txt +75 -0
  55. data/lib/anystyle/support/parser.mod +15025 -0
  56. data/lib/anystyle/support/parser.txt +75 -0
  57. data/lib/anystyle/utils.rb +70 -0
  58. data/lib/anystyle/version.rb +3 -0
  59. data/res/finder/bb132pr2055.ttx +6803 -0
  60. data/res/finder/bb550sh8053.ttx +18660 -0
  61. data/res/finder/bb599nz4341.ttx +2957 -0
  62. data/res/finder/bb725rt6501.ttx +15276 -0
  63. data/res/finder/bc605xz1554.ttx +18815 -0
  64. data/res/finder/bd040gx5718.ttx +4271 -0
  65. data/res/finder/bd413nt2715.ttx +4956 -0
  66. data/res/finder/bd466fq0394.ttx +6100 -0
  67. data/res/finder/bf668vw2021.ttx +3578 -0
  68. data/res/finder/bg495cx0468.ttx +7267 -0
  69. data/res/finder/bg599vt3743.ttx +6752 -0
  70. data/res/finder/bg608dx2253.ttx +4094 -0
  71. data/res/finder/bh410qk3771.ttx +8785 -0
  72. data/res/finder/bh989ww6442.ttx +17204 -0
  73. data/res/finder/bj581pc8202.ttx +2719 -0
  74. data/res/parser/bad.xml +5199 -0
  75. data/res/parser/core.xml +7924 -0
  76. data/res/parser/gold.xml +2707 -0
  77. data/res/parser/good.xml +34281 -0
  78. data/res/parser/stanford-books.xml +2280 -0
  79. data/res/parser/stanford-diss.xml +726 -0
  80. data/res/parser/stanford-theses.xml +4684 -0
  81. data/res/parser/ugly.xml +33246 -0
  82. metadata +195 -0
@@ -0,0 +1,43 @@
module AnyStyle
  class Feature
    # Emits the prefixes (or, in suffix mode, the suffixes) of a token
    # for every length from 1 up to +size+.
    class Affix < Feature
      attr_reader :size

      # size:   number of affixes to emit (lengths 1..size).
      # prefix: passing false switches the feature to suffix mode.
      # suffix: passing a truthy value switches the feature to suffix mode.
      def initialize(size: 4, prefix: true, suffix: false)
        @size = size
        @suffix = (suffix || !prefix)
      end

      # Returns an Array with +size+ strings. When the token is shorter
      # than +size+ the longest available affix is repeated.
      def observe(token, **opts)
        letters = extract(token)
        build(letters) { |slice| join(slice) }
      end

      # Returns up to +size+ characters of the token. In suffix mode the
      # characters are taken from the end and returned in reverse order.
      def extract(token)
        letters = token.chars
        letters = letters.reverse if suffix?
        letters.take(size)
      end

      # Glues characters back into a string, undoing the reversal applied
      # by #extract when in suffix mode.
      def join(chars)
        ordered = suffix? ? chars.reverse : chars
        ordered.join('')
      end

      # Yields the first n characters for n = 1..size and collects the
      # block results.
      def build(chars)
        (1..size).map do |length|
          yield chars.take(length)
        end
      end

      def suffix?
        @suffix ? true : false
      end

      def prefix?
        !suffix?
      end
    end
  end
end
@@ -0,0 +1,32 @@
module AnyStyle
  class Feature
    # Describes how a token relates to round, square and angle brackets.
    class Brackets < Feature
      # Ordered rules: the first pattern that matches the token decides
      # the label, so the order of the entries is significant.
      RULES = [
        [/^[^\(\[<>\)\]]+$/,          :none],
        [/^\(.*\)[,;:\p{Pd}\.]?$/,    :parens],
        [/^\[.*\][,;:\p{Pd}\.]?$/,    :'square-brackets'],
        [/^<.*>[,;:\p{Pd}\.]?$/,      :angle],
        [/\)[,;:\p{Pd}\.]?$/,         :'closing-paren'],
        [/^\(/,                       :'opening-paren'],
        [/\][,;:\p{Pd}\.]?$/,         :'closing-square-bracket'],
        [/^\[/,                       :'opening-square-bracket'],
        [/>[,;:\p{Pd}\.]?$/,          :'closing-angle'],
        [/^</,                        :'opening-angle']
      ].freeze

      # Returns the label of the first matching rule, or :other when no
      # rule matches (for example an empty token or a bracket in the
      # middle of the token).
      def observe(token, **opts)
        _, label = RULES.find { |pattern, _| pattern === token }
        label || :other
      end
    end
  end
end
@@ -0,0 +1,13 @@
module AnyStyle
  class Feature
    # Reports the canonical form of a token's alphabetic part.
    class Canonical < Feature
      # Returns :BLANK when +alpha+ is empty; otherwise delegates to
      # +canonize+, which is not defined in this class and is expected
      # to be inherited.
      def observe(token, alpha:, **opts)
        return :BLANK if alpha.empty?

        canonize(alpha)
      end
    end
  end
end
@@ -0,0 +1,20 @@
module AnyStyle
  class Feature
    # Classifies the capitalisation of a token's alphabetic part.
    class Caps < Feature
      # Returns one of:
      #   :single  - exactly one upper-case letter
      #   :initial - an upper-case letter followed by a lower-case letter
      #   :caps    - upper-case letters only
      #   :lower   - lower-case letters only
      #   :other   - anything else, including the empty string
      # The checks run in this order and the first hit wins.
      def observe(_, alpha:, **opts)
        return :single  if /^\p{Upper}$/ === alpha
        return :initial if /^\p{Upper}\p{Lower}/ === alpha
        return :caps    if /^\p{Upper}+$/ === alpha
        return :lower   if /^\p{Lower}+$/ === alpha

        :other
      end
    end
  end
end
@@ -0,0 +1,70 @@
module AnyStyle
  class Feature
    # Reports the Unicode general category of selected characters of a
    # token (by default the first and the last one).
    class Category < Feature
      # Ordered lookup table: specific categories come before the broader
      # class that contains them, and the first matching entry wins.
      CATEGORIES = [
        [/\p{Lu}/, :Lu],
        [/\p{Ll}/, :Ll],
        [/\p{Lm}/, :Lm],
        [/\p{L}/,  :L],
        [/\p{M}/,  :M],
        [/\p{N}/,  :N],
        [/\p{Pc}/, :Pc],
        [/\p{Pd}/, :Pd],
        [/\p{Ps}/, :Ps],
        [/\p{Pe}/, :Pe],
        [/\p{Pi}/, :Pi],
        [/\p{Pf}/, :Pf],
        [/\p{P}/,  :P],
        [/\p{S}/,  :S],
        [/\p{Zl}/, :Zl],
        [/\p{Zp}/, :Zp],
        [/\p{Z}/,  :Z],
        [/\p{C}/,  :C]
      ].freeze

      attr_reader :index

      # index: character positions to inspect (negative values count
      #        from the end of the token).
      # strip: when truthy, surrounding whitespace is removed first.
      def initialize(index: [0, -1], strip: false)
        @index = index
        @strip = strip ? true : false
      end

      # Returns one category symbol per entry in +index+. Positions that
      # do not exist in the token yield :none.
      def observe(token, **opts)
        picked = chars(token).values_at(*index)
        picked.map { |char| categorize(char) }
      end

      # Returns the characters of the (optionally stripped) token.
      def chars(token)
        source = strip? ? token.strip : token
        source.chars
      end

      # Maps a single character to its category symbol, or :none if no
      # category applies (e.g. when +char+ is nil).
      def categorize(char)
        CATEGORIES.each do |pattern, name|
          return name if pattern === char
        end
        :none
      end

      def strip?
        @strip
      end
    end
  end
end
@@ -0,0 +1,16 @@
module AnyStyle
  class Feature
    # Looks up the tags a dictionary holds for a token.
    class Dictionary < Feature
      attr_reader :dictionary

      # dictionary: object responding to +tags+; it is stored and queried
      #             on every observation.
      # Any remaining options are forwarded to the parent initializer.
      def initialize(dictionary:, **opts)
        super(**opts)
        @dictionary = dictionary
      end

      # Returns whatever the dictionary's +tags+ method yields for the
      # down-cased +alpha+ string.
      def observe(token, alpha:, **opts)
        key = alpha.downcase
        dictionary.tags(key)
      end
    end
  end
end
@@ -0,0 +1,16 @@
module AnyStyle
  class Feature
    # Compares the indentation of a line with that of the preceding line.
    # Relies on +indent+ and +prev+, which are not defined in this class
    # and are expected to be inherited.
    class Indent < Feature
      # Returns a two-element Array:
      #   [0] 'T' if the line's indent is greater than zero, else 'F'
      #   [1] '-' if the indent is smaller than the previous line's,
      #       '+' if it is larger, '=' if it is equal. When there is no
      #       previous element the previous indent counts as 0.
      def observe(token, seq:, idx:, **opts)
        current = indent(token)
        neighbour = prev(idx, seq)
        previous = neighbour.nil? ? 0 : indent(neighbour.value)

        indented = current > 0 ? 'T' : 'F'

        change =
          if current < previous
            '-'
          elsif current > previous
            '+'
          else
            '='
          end

        [indented, change]
      end
    end
  end
end
@@ -0,0 +1,52 @@
module AnyStyle
  class Feature
    # Tags tokens that are well-known bibliographic keywords (editor,
    # translator, journal, page, volume, date, ...).
    class Keyword < Feature
      # token: the raw token; a literal '&' is reported as :and.
      # alpha: the string matched against the keyword patterns
      #        (defaults to the token itself).
      #
      # Returns the symbol of the first matching group, or :none.
      #
      # NOTE(review): if these values feed a trained model, changing a
      # pattern may require retraining -- confirm with the model owner.
      def observe(token, alpha: token, **opts)
        case token
        when '&'
          :and
        else
          case alpha
          when /^ed(s|itors?|ited?|iteurs?)?$/i,
            /^(hg|hrsg|herausgeber)$/i,
            /^(compilador)$/i
            :editor
          when /^trans(l(ated|ators?|ation))?$/i,
            /^übers(etz(t|ung))?$/i,
            /^trad(uction|ucteurs?|uit)?$/i
            :translator
          when /^(dissertation|thesis)$/i
            :thesis
          when /^(proceedings|conference|meeting|transactions|communications|seminar|symposi(on|um))/i
            :proceedings
          when /^(Journal|Zeitschrift|Quarterly|Magazine?|Times|Rev(iew|vue)?|Bulletin|News|Week)/
            :journal
          when /^in$/i
            :in
          when /^([AaUu]nd|y|e)$/
            :and
          when /^(etal|others)$/
            :etal
          when /^(pp?|pages?|S(eiten?)?|ff?)$/
            :page
          when /^(vol(ume)?s?|iss(ue)?|n[or]?|number)$/i
            :volume
          when /^(edn|edition|expanded|rev(ised)?|p?reprint(ed)?|illustrated)$/i,
            /^(aufl(age)?|\p{Alpha}*ausg(abe)?)$/i
            :edition
          # All three date patterns are case-insensitive so that the
          # months January-June are treated like seasons and July-December
          # ("January" and "July" both yield :date).
          when /^(nd|date|spring|s[uo]mmer|autumn|fall|winter|frühling|herbst)$/i,
            /^(jan(uary?)?|feb(ruary?)?|mar(ch|z)?|apr(il)?|ma[yi]|jun[ei]?)$/i,
            /^(jul[yi]?|aug(ust)?|sep(tember)?|o[ck]t(ober)?|nov(ember)?|de[cz](ember)?)$/i
            :date
          when /^(pmid|pmcid|arxiv|doi|url)/i
            :locator
          when /^(retrieved|accessed)$/i
            :accessed
          else
            :none
          end
        end
      end
    end
  end
end
@@ -0,0 +1,39 @@
module AnyStyle
  class Feature
    # Computes per-line statistics and a coarse line classification.
    # Relies on +display_chars+, +count+ and +ratio+, which are not
    # defined in this class and are expected to be inherited.
    class Line < Feature
      # token: the line's text.
      # page:  object responding to +width+.
      #
      # Returns an Array of eight values: letter count, line width,
      # upper-case/letter ratio, letter/width ratio, whitespace/width
      # ratio, punctuation/width ratio, width/page-width ratio and the
      # result of #classify. The width is measured after trailing
      # whitespace has been removed.
      def observe(token, page:, **opts)
        text = display_chars(token).rstrip

        letters     = count(text, /\p{L}/)
        capitals    = count(text, /\p{Lu}/)
        punctuation = count(text, /[\p{Pd}:.,&\(\)"'”„’‚´«「『‘“`»」』]/)
        blanks      = count(text, /\s/)
        width       = text.length

        features = [letters, width]
        features << ratio(capitals, letters)
        features << ratio(letters, width)
        features << ratio(blanks, width)
        features << ratio(punctuation, width)
        features << ratio(width, page.width)
        features << classify(text)
        features
      end

      # Returns :toc for dot leaders (four dots, optionally separated by
      # whitespace, or repeated ellipsis characters), :num for a line
      # ending in three whitespace characters followed by digits, :cap
      # for a line starting with "Table", "Figure" or "Fig.", and :none
      # otherwise. The checks run in that order.
      def classify(chars)
        return :toc if /\.\s*\.\s*\.\s*\.|……+/ === chars
        return :num if /\s\s\s\d+$/ === chars
        return :cap if /^\s*(Table|Fig(ure|\.))/ === chars

        :none
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'uri'
2
+
module AnyStyle
  class Feature
    # Flags tokens that look like a locator: an identifier keyword
    # (DOI, ISBN, URL, PMID, ...), a DOI, or a URI.
    class Locator < Feature
      # Identifier keywords appearing as whole words.
      KEYWORD = /\b(DOI|doi|ISBN|Url|URL|PMCID|PMID|PMC\d+|PubMed)\b/

      # DOI name: "10." followed by a 4-9 digit registrant code, a slash
      # and a suffix. The dot is escaped so that strings such as
      # "10x1234/abc" are not mistaken for a DOI.
      DOI = /10\.\d{4,9}\/[-._;()\/:A-Z0-9]+/i

      # Absolute URI pattern from the standard library. Built once via
      # the parser object instead of the obsolete URI.regexp helper.
      URL = URI::DEFAULT_PARSER.make_regexp

      # Returns 'T' if the token matches any locator pattern, else 'F'.
      def observe(token, **opts)
        case token
        when KEYWORD, DOI, URL
          'T'
        else
          'F'
        end
      end
    end
  end
end
@@ -0,0 +1,39 @@
module AnyStyle
  class Feature
    # Classifies the numeric shape of a token.
    class Number < Feature
      # Ordered rules: the first pattern that matches decides the label,
      # so more specific shapes are listed before more general ones.
      RULES = [
        [/\d[\(:;]\d/,                            :volume],
        [/^97[89](\p{Pd}?\d){10}$/,               :isbn],
        [/^\d(\p{Pd}?\d){9}$/,                    :isbn],
        [/\b(1\d|20)\d\d\b/,                      :year],
        [/^\d\d\d\d$/,                            :quad],
        [/^\d\d\d$/,                              :triple],
        [/^\d\d$/,                                :double],
        [/^\d$/,                                  :single],
        [/^\d+$/,                                 :all],
        [/^\d+\p{Pd}+\d+$/,                       :range],
        [/^\p{Lu}[\p{Lu}\p{Pd}\/]+\d+[,.:]?$/,    :idnum],
        [/\d\p{Alpha}{1,3}\b/i,                   :ordinal],
        [/\d/,                                    :numeric],
        [/^([IVXLDCM]+|[ivx]+)\b/,                :roman]
      ].freeze

      # Returns the label of the first matching rule, or :none when the
      # token matches none of them.
      def observe(token, **opts)
        RULES.each do |pattern, label|
          return label if pattern === token
        end
        :none
      end
    end
  end
end
@@ -0,0 +1,28 @@
module AnyStyle
  class Feature
    # Reports where an element sits inside its sequence.
    class Position < Feature
      attr_reader :idx, :seq

      # idx: name of the option that carries the element's index.
      # seq: name of the option that carries the whole sequence.
      # Any remaining options are forwarded to the parent initializer as
      # keyword arguments; passing the hash positionally raises an
      # ArgumentError on Ruby 3 when the parent accepts only keywords.
      def initialize(idx: :idx, seq: :seq, **opts)
        super(**opts)
        @idx, @seq = idx, seq
      end

      # Reads the index and the sequence from +opts+ using the configured
      # option names.
      #
      # Returns :only for the sole element of a sequence, :first or :last
      # for the boundary elements, and otherwise the result of +ratio+
      # for index and sequence size (+ratio+ is not defined in this class
      # and is expected to be inherited).
      def observe(token, **opts)
        i = opts[idx]
        n = opts[seq].size

        case
        when i == 0 && i == n - 1
          :only
        when i == 0
          :first
        when i == n - 1
          :last
        else
          ratio i, n
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
module AnyStyle
  class Feature
    # Reports the most significant punctuation found in a token.
    class Punctuation < Feature
      # Returns :none when the token is non-empty and contains no
      # punctuation character at all. Otherwise the first of these that
      # occurs anywhere in the token decides: colon, dash (:hyphen),
      # period, ampersand (:amp). Everything else, including the empty
      # string, is :other.
      def observe(token, **opts)
        return :none   if /^\p{^P}+$/ === token
        return :colon  if /:/ === token
        return :hyphen if /\p{Pd}/ === token
        return :period if /\./ === token
        return :amp    if /&/ === token

        :other
      end
    end
  end
end
@@ -0,0 +1,20 @@
module AnyStyle
  class Feature
    # Describes how a token relates to quotation marks.
    class Quotes < Feature
      # Ordered rules; the first pattern that matches decides the label.
      RULES = {
        /^[^"'”„’‚´«「『‘“`»」』]+$/ => :none,
        /^["'”„’‚´«「『‘“`»].*["'”„’‚´«‘“`»」』][,;:\p{Pd}!\?\.]?$/ => :'quote-unquote',
        /^["'”„’‚´«「『‘“`»]/ => :quote,
        /["'”„’‚´«‘“`»」』][,;:\p{Pd}!\?\.]?$/ => :unquote
      }.freeze

      # Returns:
      #   :none            - non-empty token without any quotation mark
      #   :'quote-unquote' - token starts with a quotation mark and ends
      #                      with one (optionally followed by a single
      #                      punctuation character)
      #   :quote           - token starts with a quotation mark
      #   :unquote         - token ends with a quotation mark (optionally
      #                      followed by a single punctuation character)
      #   :other           - anything else, including the empty string
      def observe(token, **opts)
        RULES.each do |pattern, label|
          return label if pattern === token
        end
        :other
      end
    end
  end
end