anystyle 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82):
  1. checksums.yaml +7 -0
  2. data/HISTORY.md +78 -0
  3. data/LICENSE +27 -0
  4. data/README.md +103 -0
  5. data/lib/anystyle.rb +71 -0
  6. data/lib/anystyle/dictionary.rb +132 -0
  7. data/lib/anystyle/dictionary/gdbm.rb +52 -0
  8. data/lib/anystyle/dictionary/lmdb.rb +67 -0
  9. data/lib/anystyle/dictionary/marshal.rb +27 -0
  10. data/lib/anystyle/dictionary/redis.rb +55 -0
  11. data/lib/anystyle/document.rb +264 -0
  12. data/lib/anystyle/errors.rb +14 -0
  13. data/lib/anystyle/feature.rb +27 -0
  14. data/lib/anystyle/feature/affix.rb +43 -0
  15. data/lib/anystyle/feature/brackets.rb +32 -0
  16. data/lib/anystyle/feature/canonical.rb +13 -0
  17. data/lib/anystyle/feature/caps.rb +20 -0
  18. data/lib/anystyle/feature/category.rb +70 -0
  19. data/lib/anystyle/feature/dictionary.rb +16 -0
  20. data/lib/anystyle/feature/indent.rb +16 -0
  21. data/lib/anystyle/feature/keyword.rb +52 -0
  22. data/lib/anystyle/feature/line.rb +39 -0
  23. data/lib/anystyle/feature/locator.rb +18 -0
  24. data/lib/anystyle/feature/number.rb +39 -0
  25. data/lib/anystyle/feature/position.rb +28 -0
  26. data/lib/anystyle/feature/punctuation.rb +22 -0
  27. data/lib/anystyle/feature/quotes.rb +20 -0
  28. data/lib/anystyle/feature/ref.rb +21 -0
  29. data/lib/anystyle/feature/terminal.rb +19 -0
  30. data/lib/anystyle/feature/words.rb +74 -0
  31. data/lib/anystyle/finder.rb +94 -0
  32. data/lib/anystyle/format/bibtex.rb +63 -0
  33. data/lib/anystyle/format/csl.rb +28 -0
  34. data/lib/anystyle/normalizer.rb +65 -0
  35. data/lib/anystyle/normalizer/brackets.rb +13 -0
  36. data/lib/anystyle/normalizer/container.rb +13 -0
  37. data/lib/anystyle/normalizer/date.rb +109 -0
  38. data/lib/anystyle/normalizer/edition.rb +16 -0
  39. data/lib/anystyle/normalizer/journal.rb +14 -0
  40. data/lib/anystyle/normalizer/locale.rb +30 -0
  41. data/lib/anystyle/normalizer/location.rb +24 -0
  42. data/lib/anystyle/normalizer/locator.rb +22 -0
  43. data/lib/anystyle/normalizer/names.rb +88 -0
  44. data/lib/anystyle/normalizer/page.rb +29 -0
  45. data/lib/anystyle/normalizer/publisher.rb +18 -0
  46. data/lib/anystyle/normalizer/pubmed.rb +18 -0
  47. data/lib/anystyle/normalizer/punctuation.rb +23 -0
  48. data/lib/anystyle/normalizer/quotes.rb +14 -0
  49. data/lib/anystyle/normalizer/type.rb +54 -0
  50. data/lib/anystyle/normalizer/volume.rb +26 -0
  51. data/lib/anystyle/parser.rb +199 -0
  52. data/lib/anystyle/support.rb +4 -0
  53. data/lib/anystyle/support/finder.mod +3234 -0
  54. data/lib/anystyle/support/finder.txt +75 -0
  55. data/lib/anystyle/support/parser.mod +15025 -0
  56. data/lib/anystyle/support/parser.txt +75 -0
  57. data/lib/anystyle/utils.rb +70 -0
  58. data/lib/anystyle/version.rb +3 -0
  59. data/res/finder/bb132pr2055.ttx +6803 -0
  60. data/res/finder/bb550sh8053.ttx +18660 -0
  61. data/res/finder/bb599nz4341.ttx +2957 -0
  62. data/res/finder/bb725rt6501.ttx +15276 -0
  63. data/res/finder/bc605xz1554.ttx +18815 -0
  64. data/res/finder/bd040gx5718.ttx +4271 -0
  65. data/res/finder/bd413nt2715.ttx +4956 -0
  66. data/res/finder/bd466fq0394.ttx +6100 -0
  67. data/res/finder/bf668vw2021.ttx +3578 -0
  68. data/res/finder/bg495cx0468.ttx +7267 -0
  69. data/res/finder/bg599vt3743.ttx +6752 -0
  70. data/res/finder/bg608dx2253.ttx +4094 -0
  71. data/res/finder/bh410qk3771.ttx +8785 -0
  72. data/res/finder/bh989ww6442.ttx +17204 -0
  73. data/res/finder/bj581pc8202.ttx +2719 -0
  74. data/res/parser/bad.xml +5199 -0
  75. data/res/parser/core.xml +7924 -0
  76. data/res/parser/gold.xml +2707 -0
  77. data/res/parser/good.xml +34281 -0
  78. data/res/parser/stanford-books.xml +2280 -0
  79. data/res/parser/stanford-diss.xml +726 -0
  80. data/res/parser/stanford-theses.xml +4684 -0
  81. data/res/parser/ugly.xml +33246 -0
  82. metadata +195 -0
@@ -0,0 +1,21 @@
1
module AnyStyle
  class Feature
    # Feature that reports how many reference-like patterns occur in a
    # piece of text: years, locators, initials and editor markers, plus a
    # flag for a leading enumeration marker.
    class Ref < Feature
      # Builds the observation for +token+.
      #
      # token - the text to inspect.
      # opts  - accepted for interface compatibility; unused here.
      #
      # Returns an Array of five Strings: four bucketed counts (see
      # #symbolize) followed by 'T' or 'F'.
      #
      # NOTE(review): +count+ is not defined in this class; presumably it
      # is inherited and returns the number of matches -- confirm.
      def observe(token, **opts)
        # Four-digit numbers starting with 10-19 or 20.
        years = symbolize(count(token, /\b(1\d|20)\d\d\b/))

        # Digit/punctuation/digit runs, dash-separated digits, or the
        # abbreviations p./pp./vol./vols./no./nos.
        locators = symbolize(
          count(token, /(\d[\(:;]\d)|(\d\s*\p{Pd}+\s*\d)|\bpp?\.|\bvols?\.|\bnos?\./i)
        )

        # A single upper-case letter followed by a period.
        initials = symbolize(count(token, /\b\p{Lu}\./))

        # Editor and "et al" markers.
        editors = symbolize(
          count(token, /\b(eds?\.|edited by|editors?|hg|hrsg|et al)\b/i)
        )

        # Does the text open with "[x]", "(1)" or "1." followed by space?
        numbered = token =~ /^\s*(\[\w+\]|\(\d+\)|\d+\.)\s+/

        [years, locators, initials, editors, numbered ? 'T' : 'F']
      end

      # Buckets a count: '-' below one, '+' for one or two, '++' otherwise.
      def symbolize(k)
        if k < 1
          '-'
        elsif k < 3
          '+'
        else
          '++'
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module AnyStyle
  class Feature
    # Feature that rates the punctuation around the end of a token.
    class Terminal < Feature
      # Classifies +token+ by the first pattern group it matches.
      #
      # token - the text to inspect.
      # opts  - accepted for interface compatibility; unused here.
      #
      # Returns :strong, :moderate, :weak or :none.
      def observe(token, **opts)
        # Period or closing bracket, optionally followed by one quote or
        # closing bracket, at the end of a line.
        closing = /[\.\)\]]["'”„’‚´«‘“`»」』\)\]]?$/

        # A comma directly followed by a quote/closing bracket (anywhere),
        # or a quote/closing bracket followed by a comma at the end.
        comma_quote = /,["'”„’‚´«‘“`»」』\)\]]|["'”„’‚´«‘“`»」』\)\]],$/

        # Colon or quote, optionally followed by one punctuation mark.
        quoted = /[:"'”„’‚´«‘“`»」』][,;:\p{Pd}!\?\.]?$/

        # Other punctuation, optionally followed by one quote.
        minor = /[!\?,;\p{Pd}]["'”„’‚´«‘“`»」』]?$/

        case token
        when closing, comma_quote then :strong
        when quoted then :moderate
        when minor then :weak
        else :none
        end
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
module AnyStyle
  class Feature
    # Feature describing the words of a piece of text: how many there are,
    # how long they are, how the first one looks, and how many of them are
    # heading words or carry dictionary tags.
    class Words < Feature
      attr_reader :dictionary

      # Words that are counted as "title" words by #observe.
      TITLE_WORDS = %w[
        abstract
        acknowledgements
        appendix
        bibliography
        bibliographie
        chapter
        cited
        contents
        figures
        introduction
        literatur
        literature
        references
        referenzen
        section
        tables
        works
      ]

      # dictionary - object answering +tag_counts(words)+.
      # opts       - forwarded unchanged to the superclass initializer.
      def initialize(dictionary:, **opts)
        super(**opts)
        @dictionary = dictionary
      end

      # Builds the observation for +token+.
      #
      # Returns an Array of: word count, average word length, median word
      # length, number of wide gaps (two or more whitespace characters
      # between non-space characters), the class of the first word,
      # number of numeric runs, the title-word ratio, and one ratio per
      # entry returned by the dictionary's +tag_counts+.
      #
      # NOTE(review): +canonize+ and +ratio+ are not defined in this
      # class; presumably inherited helpers -- confirm.
      def observe(token, **opts)
        terms = []
        token.scan(/\S+/) do |chunk|
          term = canonize chunk
          terms << term unless term.empty?
        end

        gaps = token.scan(/\S\s\s+\S/)
        digits = token.scan(/\d+(\.\d+)?/)
        headings = terms.count { |term| TITLE_WORDS.include?(term) }
        tagged = dictionary.tag_counts(terms)

        if terms.empty?
          mean = 0
          median = 0
        else
          sizes = terms.map(&:length).sort
          middle = sizes.length / 2

          # Integer division throughout, so both values are truncated.
          mean = sizes.reduce(0) { |total, size| total + size } / sizes.length
          median =
            if sizes.length.even?
              (sizes[middle - 1] + sizes[middle]) / 2
            else
              sizes[middle]
            end
        end

        observation = [
          terms.length,
          mean,
          median,
          gaps.length,
          classify(terms[0]),
          digits.length,
          ratio(headings, terms.length)
        ]

        observation.concat(tagged.map { |tally| ratio(tally, terms.length) })
      end

      # Classifies a single word.
      #
      # Returns :none for nil; :number for digits-only words, for the
      # roman-numeral shapes matched below and for the empty string;
      # :numeric for any other word containing a digit; :alpha otherwise.
      def classify(word)
        return :none if word.nil?

        case word
        when /^(\d+|[vx]?iii?|i?[vx]|)$/i then :number
        when /\d/ then :numeric
        else :alpha
        end
      end
    end
  end
end
@@ -0,0 +1,94 @@
1
module AnyStyle
  # Labels the lines of documents and returns the result as references,
  # hashes or the labelled dataset itself.
  #
  # NOTE(review): +options+, +features+ and +model+ are not defined in
  # this class; presumably they are provided by ParserCore -- confirm.
  class Finder < ParserCore
    @formats = [:hash, :references, :wapiti]

    @defaults = {
      model: File.join(SUPPORT, 'finder.mod'),
      pattern: File.join(SUPPORT, 'finder.txt'),
      compact: true,
      threads: 4,
      format: :references,
      # Object#untaint was removed in Ruby 3.2; only call it where it
      # still exists so that loading this file does not raise.
      training_data: Dir[File.join(RES, 'finder', '*.ttx')].map { |path|
        path.respond_to?(:untaint) ? path.untaint : path
      }
    }

    # options - Hash of options passed on to the superclass. A
    #           :dictionary entry, when present, is used for the Words
    #           feature instead of Dictionary.instance.
    def initialize(options = {})
      super(options)

      @features = [
        Feature::Line.new,
        Feature::Category.new(strip: true),
        Feature::Words.new(dictionary: options[:dictionary] || Dictionary.instance),
        Feature::Indent.new,
        Feature::Ref.new,
        Feature::Position.new(seq: :page, idx: :ln),
        Feature::Position.new(seq: :pages, idx: :pn)
      ]
    end

    # Assigns observations to every line of every document in +dataset+
    # by asking each feature to observe the line's value and flattening
    # the results into a single list.
    def expand(dataset)
      dataset.each do |doc|
        doc.each.with_index do |(line, ln, page, pn), idx|
          line.observations = features.map.with_index { |f, fn|
            f.observe line.value,
              page: page,
              pages: doc.pages,
              seq: doc,
              pn: pn,
              ln: ln,
              fn: fn,
              idx: idx
          }.flatten
        end
      end
    end

    # Labels +input+ and converts the result to the requested format.
    #
    # format - :references (or :ref), :hash or :wapiti; defaults to
    #          options[:format].
    #
    # Raises ArgumentError for any other format.
    def find(input, format: options[:format], **opts)
      case format.to_sym
      when :references, :ref
        format_references(label(input, **opts), **opts)
      when :hash
        format_hash(label(input, **opts), **opts)
      when :wapiti
        label(input, **opts)
      else
        raise ArgumentError, "unknown format '#{format}'"
      end
    end

    # Returns the result of +to_h+ for every document in +dataset+.
    def format_hash(dataset, **opts)
      dataset.map { |doc| doc.to_h(**opts) }
    end

    # Returns the result of +references+ for every document in +dataset+.
    def format_references(dataset, **opts)
      dataset.map { |doc| doc.references(**opts) }
    end

    # Prepares +input+, runs the model over it and returns a new
    # Wapiti::Dataset of the documents labelled with the model output.
    def label(input, layout: true, **opts)
      dataset = prepare(input, layout: layout, **opts)
      output = model.label(dataset, **opts)
      Wapiti::Dataset.new(dataset.map.with_index { |doc, idx|
        doc.label(output[idx])
      })
    end

    # Turns +input+ into something the superclass can prepare.
    #
    # A String is opened as a single Document and an Array is opened
    # element by element into a Wapiti::Dataset; anything else is passed
    # through unchanged. The +layout+ option is forwarded to
    # Document.open in both cases (the Array case used to drop it).
    def prepare(input, layout: true, **opts)
      case input
      when String
        super(Document.open(input, layout: layout, **opts), **opts)
      when Array
        docs = input.map { |f| Document.open(f, layout: layout, **opts) }
        super(Wapiti::Dataset.new(docs), **opts)
      else
        super(input, **opts)
      end
    end

    # Writes every document in +dataset+ to its own file in +dir+.
    #
    # The file is named after the document's path (without directory and
    # extension), or after its index when the document has no path; the
    # extension is 'ttx' when +tagged+ is true and 'txt' otherwise.
    def save_each(dataset, dir: '.', tagged: false, **opts)
      dataset.each.with_index do |doc, idx|
        name = doc.path.nil? ? idx : File.basename(doc.path, File.extname(doc.path))
        file = "#{name}.#{tagged ? 'ttx' : 'txt'}"
        File.write(File.join(dir, file), doc.to_s(tagged: tagged, **opts))
      end
    end

  end
end
@@ -0,0 +1,63 @@
1
module AnyStyle
  module Format
    # Mixin that converts a dataset into a BibTeX bibliography.
    #
    # NOTE(review): +format_hash+, +flatten_values+ and +rename_value+ are
    # not defined here; presumably supplied by the including class.
    module BibTeX
      # Entry types that get a different name in the BibTeX output.
      TYPES = {
        'article-journal' => 'article',
        'chapter' => 'incollection',
        'manuscript' => 'unpublished',
        'paper-conference' => 'inproceedings',
        'report' => 'techreport'
      }

      # Builds a ::BibTeX::Bibliography with one entry per item of
      # +format_hash(dataset)+.
      #
      # The 'bibtex' library is required lazily on first use. +opts+ is
      # accepted but not used.
      def format_bibtex(dataset, **opts)
        require 'bibtex'

        # Renames that only apply to a particular BibTeX type; they run
        # before the general renames below.
        by_type = {
          'article' => [[:'container-title', :journal], [:issue, :number]],
          'techreport' => [[:publisher, :institution]],
          'thesis' => [[:publisher, :school]]
        }

        # Renames applied to every entry, in this order.
        general = [
          [:'collection-title', :series],
          [:'container-title', :booktitle],
          [:accessed, :urldate],
          [:genre, :type],
          [:location, :address]
        ]

        bibliography = ::BibTeX::Bibliography.new

        format_hash(dataset).each do |entry|
          flatten_values entry, skip: Normalizer::Names.keys

          type = entry[:type]
          entry[:bibtex_type] = TYPES[type] || type || 'misc'
          entry.delete :type

          (by_type[entry[:bibtex_type]] || []).each do |from, to|
            rename_value entry, from, to
          end

          Normalizer::Names.keys.each do |role|
            names_to_bibtex entry, role
          end

          general.each do |from, to|
            rename_value entry, from, to
          end

          bibliography << ::BibTeX::Entry.new(entry)
        end

        bibliography
      end

      # Replaces the list of names stored under +role+ with one String in
      # which the names are joined by ' and '.
      #
      # A name with a :literal key contributes that value; a name with a
      # :family or :given key contributes its family, suffix and given
      # parts joined by ', '; any other name is dropped.
      #
      # Returns the new String, or nil when +hash+ has no +role+ key.
      def names_to_bibtex(hash, role)
        return unless hash.key?(role)

        formatted = hash[role].map do |name|
          if name.key?(:literal)
            name[:literal]
          elsif name.key?(:family) || name.key?(:given)
            name.values_at(:family, :suffix, :given).compact.join(', ')
          end
        end

        hash[role] = formatted.compact.join(' and ')
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
module AnyStyle
  module Format
    # Mixin that converts a dataset into a list of CSL-style hashes.
    #
    # NOTE(review): +format_hash+, +flatten_values+ and +rename_value+ are
    # not defined here; presumably supplied by the including class.
    module CSL
      # Returns one Hash per item of +format_hash(dataset)+ after
      # flattening its values (name fields excepted), renaming a fixed
      # set of keys and removing every name flagged with :others.
      #
      # +opts+ is accepted but not used.
      def format_csl(dataset, **opts)
        # Key renames, applied in this order.
        renames = {
          pages: :page,
          location: :'publisher-place',
          url: :URL,
          doi: :DOI,
          pmid: :PMID,
          pmcid: :PMCID
        }

        format_hash(dataset).map do |hash|
          flatten_values hash, skip: Normalizer::Names.keys

          renames.each do |from, to|
            rename_value hash, from, to
          end

          Normalizer::Names.keys.each do |role|
            next unless hash.key?(role)

            hash[role].reject! { |name| name[:others] }
          end

          hash
        end
      end

      alias_method :format_citeproc, :format_csl
    end
  end
end
@@ -0,0 +1,65 @@
1
module AnyStyle
  # Base class for normalizers. A normalizer works on an item (a Hash
  # whose values are Arrays) and is restricted to a list of keys.
  class Normalizer
    # Default keys of the class; subclasses set their own @keys. An empty
    # list means "every key of the item".
    @keys = []

    class << self
      attr_reader :keys
    end

    attr_reader :keys
    attr_accessor :skip

    # keys - the keys this instance works on; defaults to the keys of its
    #        class.
    def initialize(keys: self.class.keys)
      @keys = keys
      @skip = false
    end

    # Returns the name of the normalizer's class.
    def name
      self.class.name
    end

    # Normalizes +item+. Must be implemented by subclasses.
    #
    # Raises NotImplementedError.
    def normalize(item, **opts)
      raise NotImplementedError
    end

    # Adds +value+ to the list stored under +key+, creating the list when
    # the key is missing.
    def append(item, key, value)
      if item.key?(key)
        item[key] << value
      else
        item[key] = [value]
      end
    end

    # Yields every key/value pair of +item+ for the given +keys+, skipping
    # keys the item does not have.
    #
    # Returns +item+.
    def each_value(item, keys = keys_for(item))
      keys.each do |key|
        item[key].each do |value|
          yield key, value
        end if item.key?(key)
      end
      item
    end

    # Replaces every value of +item+ for the given +keys+ with the result
    # of the block. Results are flattened, and nil or empty results are
    # dropped.
    #
    # Returns +item+.
    def map_values(item, keys = keys_for(item))
      keys.each do |key|
        if item.key?(key)
          item[key] = item[key].map { |value|
            yield key, value
          }.flatten.reject { |v| v.nil? || v.empty? }
        end
      end
      item
    end

    # Returns the keys to work on for +item+.
    #
    # The keys given to the constructor take effect here (previously they
    # were stored but ignored in favour of the class-level keys). When the
    # instance has no keys at all, e.g. because a subclass initializer did
    # not call super, the class-level keys are used. An empty list selects
    # every key of +item+.
    def keys_for(item)
      selected = keys || self.class.keys
      selected.empty? ? item.keys : selected
    end

    # Returns the current value of the +skip+ flag.
    def skip?
      @skip
    end
  end
end
@@ -0,0 +1,13 @@
1
module AnyStyle
  class Normalizer
    # Strips an opening bracket at the start of a line and a closing
    # bracket at the end of a line from citation numbers and notes.
    class Brackets < Normalizer
      @keys = [:'citation-number', :note]

      # Edits the matching values of +item+ in place and returns whatever
      # +each_value+ returns.
      #
      # +opts+ is accepted but not used.
      def normalize(item, **opts)
        each_value(item) { |_key, text| text.gsub!(/^[\(\[\{]|[\]\)\}]$/, '') }
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module AnyStyle
  class Normalizer
    # Removes a leading "In", "In:" or "In the" from container titles.
    class Container < Normalizer
      @keys = [:'container-title']

      # Maps every matching value of +item+ through the substitution and
      # returns whatever +map_values+ returns.
      #
      # The prefix is only removed when the text after it does not start
      # with a lower-case letter. +opts+ is accepted but not used.
      def normalize(item, **opts)
        map_values(item) { |_key, title| title.sub(/^[Ii]n(?::|\s+the)?\s+(\p{^Ll})/, '\1') }
      end
    end
  end
end
@@ -0,0 +1,109 @@
1
module AnyStyle
  class Normalizer
    # Rewrites date values as year[-month[-day]] strings, with a trailing
    # marker for approximate or uncertain dates.
    class Date < Normalizer
      @keys = [:date]

      # Maps every date value of +item+:
      #
      # * values that say the date is unknown become 'XXXX';
      # * values that look like an interval or already contain an ISO
      #   date are kept as they are;
      # * otherwise the year, a month given by name and (only if a month
      #   was found) a day are extracted and joined with '-', followed by
      #   the uncertainty marker, if any;
      # * values without a recognisable year are kept as they are.
      #
      # +opts+ is accepted but not used.
      def normalize(item, **opts)
        map_values(item) do |_key, date|
          # TODO AD/BC
          # TODO Seasons
          if unknown?(date)
            'XXXX'
          elsif interval?(date) || iso?(date)
            date
          else
            year = extract_year(date)

            if year.nil?
              date
            else
              month = extract_month_by_name(date)
              day = month.nil? ? nil : extract_day(date)
              parts = [year, month, day].compact.join('-')

              "#{parts}#{extract_uncertainty(date)}"
            end
          end
        end
      end

      # Match position of a yyyy-mm-dd pattern (year starting with 0, 1
      # or 2) in +date+, or nil.
      def iso?(date)
        date =~ /[012]\d\d\d-\d\d-\d\d/
      end

      # Match position of a slash, a dash surrounded by whitespace, or a
      # one/two-digit number directly followed by a dash, or nil.
      def interval?(date)
        date =~ /\/|\s\p{Pd}\s|(\s([12]?\d|30)\p{Pd}([12]?\d|3[01])?)/
      end

      # Match position of an "unknown date" phrase in +date+, or nil.
      def unknown?(date)
        date =~ /inconnue|unknown|unbekannt|[ns]\. ?d\b|no date/i
      end

      # Match position of a question mark in +date+, or nil.
      def uncertain?(date)
        date =~ /\?/
      end

      # Match position of an "approximately" marker in +date+, or nil.
      def approximate?(date)
        date =~ /(\b(circa|ca\.|vers|approx))|(^[cv]\.)/i
      end

      # Returns '%' for dates that are both approximate and uncertain,
      # '~' for approximate ones, '?' for uncertain ones and nil otherwise.
      def extract_uncertainty(date)
        return (uncertain?(date) ? '%' : '~') if approximate?(date)

        '?' if uncertain?(date)
      end

      # Returns the first four-digit run starting with 0, 1 or 2 as a
      # String, or nil.
      def extract_year(date)
        Regexp.last_match(1) if date =~ /\D?([012]\d\d\d)\D?/
      end

      # Returns the first stand-alone number from 0 to 31 as a two-digit,
      # zero-padded String, or nil.
      def extract_day(date)
        format('%02d', Regexp.last_match(1).to_i) if date =~ /\b([012]?\d|3[01])\b/
      end

      # Returns the two-digit number of the first month name recognised
      # in +date+, or nil. The patterns are tried in calendar order.
      def extract_month_by_name(date)
        case date
        when /\bjan/i then '01'
        when /\bf(eb|év)/i then '02'
        when /\bmar/i then '03'
        when /\ba[pv]r/i then '04'
        when /\bma[yi]/i then '05'
        when /\bjui?n/i then '06'
        when /\bjui?l/i then '07'
        when /\ba(ug|oût)/i then '08'
        when /\bsep/i then '09'
        when /\bo[ck]t/i then '10'
        when /\bnov/i then '11'
        when /\bd[eé]c/i then '12'
        end
      end
    end
  end
end