anystyle 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/HISTORY.md +78 -0
  3. data/LICENSE +27 -0
  4. data/README.md +103 -0
  5. data/lib/anystyle.rb +71 -0
  6. data/lib/anystyle/dictionary.rb +132 -0
  7. data/lib/anystyle/dictionary/gdbm.rb +52 -0
  8. data/lib/anystyle/dictionary/lmdb.rb +67 -0
  9. data/lib/anystyle/dictionary/marshal.rb +27 -0
  10. data/lib/anystyle/dictionary/redis.rb +55 -0
  11. data/lib/anystyle/document.rb +264 -0
  12. data/lib/anystyle/errors.rb +14 -0
  13. data/lib/anystyle/feature.rb +27 -0
  14. data/lib/anystyle/feature/affix.rb +43 -0
  15. data/lib/anystyle/feature/brackets.rb +32 -0
  16. data/lib/anystyle/feature/canonical.rb +13 -0
  17. data/lib/anystyle/feature/caps.rb +20 -0
  18. data/lib/anystyle/feature/category.rb +70 -0
  19. data/lib/anystyle/feature/dictionary.rb +16 -0
  20. data/lib/anystyle/feature/indent.rb +16 -0
  21. data/lib/anystyle/feature/keyword.rb +52 -0
  22. data/lib/anystyle/feature/line.rb +39 -0
  23. data/lib/anystyle/feature/locator.rb +18 -0
  24. data/lib/anystyle/feature/number.rb +39 -0
  25. data/lib/anystyle/feature/position.rb +28 -0
  26. data/lib/anystyle/feature/punctuation.rb +22 -0
  27. data/lib/anystyle/feature/quotes.rb +20 -0
  28. data/lib/anystyle/feature/ref.rb +21 -0
  29. data/lib/anystyle/feature/terminal.rb +19 -0
  30. data/lib/anystyle/feature/words.rb +74 -0
  31. data/lib/anystyle/finder.rb +94 -0
  32. data/lib/anystyle/format/bibtex.rb +63 -0
  33. data/lib/anystyle/format/csl.rb +28 -0
  34. data/lib/anystyle/normalizer.rb +65 -0
  35. data/lib/anystyle/normalizer/brackets.rb +13 -0
  36. data/lib/anystyle/normalizer/container.rb +13 -0
  37. data/lib/anystyle/normalizer/date.rb +109 -0
  38. data/lib/anystyle/normalizer/edition.rb +16 -0
  39. data/lib/anystyle/normalizer/journal.rb +14 -0
  40. data/lib/anystyle/normalizer/locale.rb +30 -0
  41. data/lib/anystyle/normalizer/location.rb +24 -0
  42. data/lib/anystyle/normalizer/locator.rb +22 -0
  43. data/lib/anystyle/normalizer/names.rb +88 -0
  44. data/lib/anystyle/normalizer/page.rb +29 -0
  45. data/lib/anystyle/normalizer/publisher.rb +18 -0
  46. data/lib/anystyle/normalizer/pubmed.rb +18 -0
  47. data/lib/anystyle/normalizer/punctuation.rb +23 -0
  48. data/lib/anystyle/normalizer/quotes.rb +14 -0
  49. data/lib/anystyle/normalizer/type.rb +54 -0
  50. data/lib/anystyle/normalizer/volume.rb +26 -0
  51. data/lib/anystyle/parser.rb +199 -0
  52. data/lib/anystyle/support.rb +4 -0
  53. data/lib/anystyle/support/finder.mod +3234 -0
  54. data/lib/anystyle/support/finder.txt +75 -0
  55. data/lib/anystyle/support/parser.mod +15025 -0
  56. data/lib/anystyle/support/parser.txt +75 -0
  57. data/lib/anystyle/utils.rb +70 -0
  58. data/lib/anystyle/version.rb +3 -0
  59. data/res/finder/bb132pr2055.ttx +6803 -0
  60. data/res/finder/bb550sh8053.ttx +18660 -0
  61. data/res/finder/bb599nz4341.ttx +2957 -0
  62. data/res/finder/bb725rt6501.ttx +15276 -0
  63. data/res/finder/bc605xz1554.ttx +18815 -0
  64. data/res/finder/bd040gx5718.ttx +4271 -0
  65. data/res/finder/bd413nt2715.ttx +4956 -0
  66. data/res/finder/bd466fq0394.ttx +6100 -0
  67. data/res/finder/bf668vw2021.ttx +3578 -0
  68. data/res/finder/bg495cx0468.ttx +7267 -0
  69. data/res/finder/bg599vt3743.ttx +6752 -0
  70. data/res/finder/bg608dx2253.ttx +4094 -0
  71. data/res/finder/bh410qk3771.ttx +8785 -0
  72. data/res/finder/bh989ww6442.ttx +17204 -0
  73. data/res/finder/bj581pc8202.ttx +2719 -0
  74. data/res/parser/bad.xml +5199 -0
  75. data/res/parser/core.xml +7924 -0
  76. data/res/parser/gold.xml +2707 -0
  77. data/res/parser/good.xml +34281 -0
  78. data/res/parser/stanford-books.xml +2280 -0
  79. data/res/parser/stanford-diss.xml +726 -0
  80. data/res/parser/stanford-theses.xml +4684 -0
  81. data/res/parser/ugly.xml +33246 -0
  82. metadata +195 -0
@@ -0,0 +1,21 @@
module AnyStyle
  class Feature
    # Emits five observations describing how much a token looks like a
    # bibliographic reference.
    class Ref < Feature
      # Returns an Array of five Strings: four bucketed match counts (see
      # #symbolize) followed by a 'T'/'F' flag.
      #
      # The counted patterns are, in order:
      # 1. four-digit numbers in the range 1000-2099;
      # 2. locator-like fragments (digit-punctuation-digit, dash-separated
      #    digits, "p."/"pp.", "vol."/"vols.", "no."/"nos.");
      # 3. a single upper-case letter followed by a period;
      # 4. editor / "et al" markers.
      #
      # The flag is 'T' when the token starts with an enumerator such as
      # "[x]", "(1)" or "1." followed by whitespace.
      #
      # NOTE(review): in pattern 4 the alternative `eds?\.` is followed by
      # `\b`, so "ed." only matches when a word character directly follows
      # the period. The pattern is left untouched here because changing it
      # would change the emitted features.
      def observe(token, **opts)
        counted = [
          /\b(1\d|20)\d\d\b/,
          /(\d[\(:;]\d)|(\d\s*\p{Pd}+\s*\d)|\bpp?\.|\bvols?\.|\bnos?\./i,
          /\b\p{Lu}\./,
          /\b(eds?\.|edited by|editors?|hg|hrsg|et al)\b/i
        ]

        observations = counted.map { |pattern| symbolize(count(token, pattern)) }
        enumerated = token =~ /^\s*(\[\w+\]|\(\d+\)|\d+\.)\s+/
        observations << (enumerated ? 'T' : 'F')
      end

      # Buckets a count: '-' below 1, '+' for 1 or 2, '++' for 3 and above.
      def symbolize(k)
        if k < 1
          '-'
        elsif k < 3
          '+'
        else
          '++'
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
module AnyStyle
  class Feature
    # Classifies how strongly the end of a token looks like the end of a
    # segment, based on its trailing punctuation and quotation marks.
    class Terminal < Feature
      # Returns :strong, :moderate, :weak or :none. Rules are tried in
      # order and the first matching one wins.
      def observe(token, **opts)
        rules = [
          [:strong, /[\.\)\]]["'”„’‚´«‘“`»」』\)\]]?$/],
          [:strong, /,["'”„’‚´«‘“`»」』\)\]]|["'”„’‚´«‘“`»」』\)\]],$/],
          [:moderate, /[:"'”„’‚´«‘“`»」』][,;:\p{Pd}!\?\.]?$/],
          [:weak, /[!\?,;\p{Pd}]["'”„’‚´«‘“`»」』]?$/]
        ]

        # Regexp#=== is the comparison a case/when statement performs.
        strength, = rules.find { |_, pattern| pattern === token }
        strength || :none
      end
    end
  end
end
@@ -0,0 +1,74 @@
module AnyStyle
  class Feature
    # Word-level statistics for a token (a line of text): word count,
    # word-length mean and median, wide-gap count, class of the first word,
    # number count, and several ratios.
    class Words < Feature
      # Canonical words counted towards the "title word" ratio.
      TITLE_WORDS = %w{
        abstract
        acknowledgements
        appendix
        bibliography
        bibliographie
        chapter
        cited
        contents
        figures
        introduction
        literatur
        literature
        references
        referenzen
        section
        tables
        works
      }

      attr_reader :dictionary

      # dictionary: - object whose #tag_counts(words) is called by #observe.
      # All remaining options are passed on to Feature#initialize.
      def initialize(dictionary:, **opts)
        super(**opts)
        @dictionary = dictionary
      end

      # Returns a flat Array:
      #   [word count, mean word length, median word length,
      #    runs of two or more whitespace characters between non-blanks,
      #    class of the first word (see #classify), count of numbers,
      #    title-word ratio, one ratio per dictionary tag count]
      #
      # Words are the canonized, non-empty whitespace-separated chunks of
      # the token. Mean and median use integer division.
      def observe(token, **opts)
        words = token.scan(/\S+/).map { |word| canonize word }.reject(&:empty?)
        total = words.length

        gaps = token.scan(/\S\s\s+\S/).length
        figures = token.scan(/\d+(\.\d+)?/).length
        headings = words.count { |word| TITLE_WORDS.include?(word) }
        tag_counts = dictionary.tag_counts(words)
        mean, median = length_stats(words)

        [
          total,
          mean,
          median,
          gaps,
          classify(words[0]),
          figures,
          ratio(headings, total),
          *tag_counts.map { |cnt| ratio(cnt, total) }
        ]
      end

      # Classifies a word as :number (digits only, one of the roman
      # numeral shapes accepted by the pattern, or the empty string),
      # :numeric (contains a digit), :none (nil) or :alpha (anything else).
      def classify(word)
        # The patterns below never match nil, so testing for it first
        # yields the same result as testing for it last.
        return :none if word.nil?

        case word
        when /^(\d+|[vx]?iii?|i?[vx]|)$/i
          :number
        when /\d/
          :numeric
        else
          :alpha
        end
      end

      private

      # Returns [mean, median] of the word lengths, both via integer
      # division; [0, 0] when there are no words.
      def length_stats(words)
        return [0, 0] if words.empty?

        sizes = words.map(&:length).sort
        n = sizes.length
        mean = sizes.inject(0) { |sum, size| sum + size } / n
        median = n.odd? ? sizes[n / 2] : (sizes[n / 2 - 1] + sizes[n / 2]) / 2

        [mean, median]
      end
    end
  end
end
@@ -0,0 +1,94 @@
module AnyStyle
  # ParserCore subclass that labels the lines of whole documents and can
  # return the result as references, hashes, or the labelled dataset itself.
  class Finder < ParserCore
    # Output formats understood by #find.
    @formats = [:hash, :references, :wapiti]

    # Class-level defaults; presumably consumed by ParserCore - confirm there.
    @defaults = {
      model: File.join(SUPPORT, 'finder.mod'),
      pattern: File.join(SUPPORT, 'finder.txt'),
      compact: true,
      threads: 4,
      format: :references,
      # Object#untaint was removed in Ruby 3.2, and this expression runs
      # while the class is being loaded. Only untaint where the method
      # still exists so that loading does not raise on newer Rubies.
      training_data: Dir[File.join(RES, 'finder', '*.ttx')].each { |path|
        path.untaint if path.respond_to?(:untaint)
      }
    }

    # options - Hash passed on to ParserCore#initialize. options[:dictionary]
    #           is used for the Words feature; Dictionary.instance otherwise.
    def initialize(options = {})
      super(options)

      @features = [
        Feature::Line.new,
        Feature::Category.new(strip: true),
        Feature::Words.new(dictionary: options[:dictionary] || Dictionary.instance),
        Feature::Indent.new,
        Feature::Ref.new,
        Feature::Position.new(seq: :page, idx: :ln),
        Feature::Position.new(seq: :pages, idx: :pn)
      ]
    end

    # Sets the observations of every line in every document of the dataset
    # to the flattened results of all features. Each feature receives the
    # line's value plus the page, the document's pages, the document, the
    # page and line numbers, the feature index and the running line index.
    def expand(dataset)
      dataset.each do |doc|
        doc.each.with_index do |(line, ln, page, pn), idx|
          line.observations = features.map.with_index { |f, fn|
            f.observe line.value,
              page: page,
              pages: doc.pages,
              seq: doc,
              pn: pn,
              ln: ln,
              fn: fn,
              idx: idx
          }.flatten
        end
      end
    end

    # Labels the input and returns it in the requested format.
    #
    # format: - :references (or :ref), :hash or :wapiti; defaults to
    #           options[:format].
    #
    # Raises ArgumentError for any other format.
    def find(input, format: options[:format], **opts)
      case format.to_sym
      when :references, :ref
        format_references(label(input, **opts), **opts)
      when :hash
        format_hash(label(input, **opts), **opts)
      when :wapiti
        label(input, **opts)
      else
        raise ArgumentError, "unknown format '#{format}'"
      end
    end

    # Converts every document of the dataset with Document#to_h.
    def format_hash(dataset, **opts)
      dataset.map { |doc| doc.to_h(**opts) }
    end

    # Collects Document#references for every document of the dataset.
    def format_references(dataset, **opts)
      dataset.map { |doc| doc.references(**opts) }
    end

    # Prepares the input, labels it with the model, and returns a new
    # Wapiti::Dataset built from each document's #label result.
    def label(input, layout: true, **opts)
      dataset = prepare(input, layout: layout, **opts)
      output = model.label(dataset, **opts)
      Wapiti::Dataset.new(dataset.map.with_index { |doc, idx|
        doc.label(output[idx])
      })
    end

    # Opens String input (a single path) or Array input (several paths)
    # with Document.open before handing over to ParserCore#prepare; any
    # other input is passed through unchanged.
    #
    # layout: - forwarded to Document.open for both String and Array input.
    #           (Previously the Array branch dropped it, so `layout: false`
    #           had no effect when several files were given.)
    def prepare(input, layout: true, **opts)
      case input
      when String
        super(Document.open(input, layout: layout, **opts), **opts)
      when Array
        docs = input.map { |f| Document.open(f, layout: layout, **opts) }
        super(Wapiti::Dataset.new(docs), **opts)
      else
        super(input, **opts)
      end
    end

    # Writes every document of the dataset to its own file below dir.
    #
    # The file name is the document path's basename without extension, or
    # the document's index when it has no path; the extension is 'ttx' when
    # tagged is true and 'txt' otherwise.
    def save_each(dataset, dir: '.', tagged: false, **opts)
      dataset.each.with_index do |doc, idx|
        name = doc.path.nil? ? idx : File.basename(doc.path, File.extname(doc.path))
        file = "#{name}.#{tagged ? 'ttx' : 'txt'}"
        File.write(File.join(dir, file), doc.to_s(tagged: tagged, **opts))
      end
    end

  end
end
@@ -0,0 +1,63 @@
module AnyStyle
  module Format
    # Mixin that converts a dataset into a BibTeX bibliography. Relies on
    # the including object for #format_hash, #flatten_values and
    # #rename_value.
    module BibTeX
      # Item types whose BibTeX entry type has a different name.
      TYPES = {
        'article-journal' => 'article',
        'chapter' => 'incollection',
        'manuscript' => 'unpublished',
        'paper-conference' => 'inproceedings',
        'report' => 'techreport'
      }

      # Returns a ::BibTeX::Bibliography with one entry per item of the
      # dataset. The bibtex library is loaded on first use.
      def format_bibtex(dataset, **opts)
        require 'bibtex'

        # Renames that apply only to a particular entry type.
        by_type = {
          'article' => [[:'container-title', :journal], [:issue, :number]],
          'techreport' => [[:publisher, :institution]],
          'thesis' => [[:publisher, :school]]
        }

        # Renames applied to every entry, after the type-specific ones.
        for_all = [
          [:'collection-title', :series],
          [:'container-title', :booktitle],
          [:accessed, :urldate],
          [:genre, :type],
          [:location, :address]
        ]

        bibliography = ::BibTeX::Bibliography.new

        format_hash(dataset).each do |hash|
          flatten_values hash, skip: Normalizer::Names.keys

          # Mapped type, else the item's own type, else 'misc'.
          hash[:bibtex_type] = TYPES[hash[:type]] || hash[:type] || 'misc'
          hash.delete :type

          by_type.fetch(hash[:bibtex_type], []).each do |from, to|
            rename_value hash, from, to
          end

          Normalizer::Names.keys.each { |role| names_to_bibtex hash, role }

          for_all.each { |from, to| rename_value hash, from, to }

          bibliography << ::BibTeX::Entry.new(hash)
        end

        bibliography
      end

      # Replaces hash[role] (a list of name hashes) by a single string
      # joined with ' and '. A name is rendered as its :literal value or
      # as "family, suffix, given" (missing parts omitted); names with
      # none of these keys are dropped. Does nothing when the role is
      # absent.
      def names_to_bibtex(hash, role)
        return unless hash.key?(role)

        rendered = hash[role].map do |name|
          if name.key?(:literal)
            name[:literal]
          elsif name.key?(:family) || name.key?(:given)
            name.values_at(:family, :suffix, :given).compact.join(', ')
          end
        end

        hash[role] = rendered.compact.join(' and ')
      end
    end
  end
end
@@ -0,0 +1,28 @@
module AnyStyle
  module Format
    # Mixin that converts a dataset into CSL-style hashes. Relies on the
    # including object for #format_hash, #flatten_values and #rename_value.
    module CSL
      # Returns one hash per item of the dataset with all values flattened
      # except the name roles, keys renamed as listed below, and name
      # entries flagged with :others removed.
      def format_csl(dataset, **opts)
        renames = [
          [:pages, :page],
          [:location, :'publisher-place'],
          [:url, :URL],
          [:doi, :DOI],
          [:pmid, :PMID],
          [:pmcid, :PMCID]
        ]

        format_hash(dataset).map do |hash|
          flatten_values hash, skip: Normalizer::Names.keys

          renames.each { |from, to| rename_value hash, from, to }

          Normalizer::Names.keys.each do |role|
            hash[role].reject! { |name| name[:others] } if hash.key?(role)
          end

          hash
        end
      end

      alias_method :format_citeproc, :format_csl
    end
  end
end
@@ -0,0 +1,65 @@
module AnyStyle
  # Base class for normalizers. An item is a Hash mapping keys to Arrays of
  # values; a normalizer works on the values of a configured set of keys.
  class Normalizer
    # Default keys of this class. Subclasses assign their own @keys in
    # their class body; an empty list means "every key of the item".
    @keys = []

    class << self
      attr_reader :keys
    end

    attr_reader :keys
    attr_accessor :skip

    # keys: - keys this instance works on; defaults to the class-level keys.
    def initialize(keys: self.class.keys)
      @keys = keys
      @skip = false
    end

    # Returns the name of the normalizer's class.
    def name
      self.class.name
    end

    # To be implemented by subclasses.
    #
    # Raises NotImplementedError.
    def normalize(item, **opts)
      raise NotImplementedError
    end

    # Appends value to item[key], creating the list when the key is absent.
    def append(item, key, value)
      if item.key?(key)
        item[key] << value
      else
        item[key] = [value]
      end
    end

    # Yields key and value for every value stored under each of the given
    # keys that is present in the item. Returns the item.
    def each_value(item, keys = keys_for(item))
      keys.each do |key|
        next unless item.key?(key)

        item[key].each { |value| yield key, value }
      end
      item
    end

    # Replaces the values stored under each of the given keys that is
    # present in the item by the block's results. The results are
    # flattened, and nil or empty ones are dropped. Returns the item.
    def map_values(item, keys = keys_for(item))
      keys.each do |key|
        next unless item.key?(key)

        item[key] = item[key]
          .map { |value| yield key, value }
          .flatten
          .reject { |v| v.nil? || v.empty? }
      end
      item
    end

    # Returns the keys to work on for the given item: this instance's keys,
    # or all of the item's keys when none are configured.
    #
    # This consults the instance's keys (which default to the class-level
    # ones) so that a `keys:` override passed to #initialize takes effect;
    # a nil key list is treated like an empty one.
    def keys_for(item)
      return item.keys if keys.nil? || keys.empty?

      keys
    end

    # Returns the current value of the skip flag (false after #initialize).
    def skip?
      @skip
    end
  end
end
@@ -0,0 +1,13 @@
module AnyStyle
  class Normalizer
    # Removes an opening bracket at the start of a line and a closing
    # bracket at the end of a line from citation numbers and notes.
    class Brackets < Normalizer
      @keys = [:'citation-number', :note]

      # Edits the matching values in place and returns whatever
      # #each_value returns.
      def normalize(item, **opts)
        enclosing = /^[\(\[\{]|[\]\)\}]$/

        each_value(item) { |_key, text| text.gsub!(enclosing, '') }
      end
    end
  end
end
@@ -0,0 +1,13 @@
module AnyStyle
  class Normalizer
    # Strips a leading "In", "In:" or "In the" from container titles when
    # the text that follows does not start with a lower-case letter.
    class Container < Normalizer
      @keys = [:'container-title']

      # Returns whatever #map_values returns for the item.
      def normalize(item, **opts)
        leading_in = /^[Ii]n(?::|\s+the)?\s+(\p{^Ll})/

        map_values(item) do |_key, title|
          # Keep only the captured first character of the actual title.
          title.sub(leading_in) { Regexp.last_match(1) }
        end
      end
    end
  end
end
@@ -0,0 +1,109 @@
module AnyStyle
  class Normalizer
    # Rewrites free-form date strings as "YYYY", "YYYY-MM" or "YYYY-MM-DD",
    # optionally followed by an uncertainty marker.
    class Date < Normalizer
      @keys = [:date]

      # Normalizes every date value of the item:
      # - dates marked as unknown become 'XXXX';
      # - intervals and ISO-looking dates are kept as they are;
      # - otherwise the year (plus month and day, when a month name is
      #   found) is extracted and an uncertainty marker appended;
      # - values without a recognizable year are kept as they are.
      def normalize(item, **opts)
        map_values(item) { |_key, value| normalize_value(value) }
      end

      # Truthy when the date contains a YYYY-MM-DD sequence.
      def iso?(date)
        date =~ /[012]\d\d\d-\d\d-\d\d/
      end

      # Truthy when the date contains a slash, a dash surrounded by
      # whitespace, or a one/two digit number followed by a dash.
      def interval?(date)
        date =~ /\/|\s\p{Pd}\s|(\s([12]?\d|30)\p{Pd}([12]?\d|3[01])?)/
      end

      # Truthy for "unknown" / "no date" style markers (several languages).
      def unknown?(date)
        date =~ /inconnue|unknown|unbekannt|[ns]\. ?d\b|no date/i
      end

      # Truthy when the date contains a question mark.
      def uncertain?(date)
        date =~ /\?/
      end

      # Truthy for "circa"-like markers or a leading "c." / "v.".
      def approximate?(date)
        date =~ /(\b(circa|ca\.|vers|approx))|(^[cv]\.)/i
      end

      # Returns '%' for approximate and uncertain dates, '~' for
      # approximate ones, '?' for uncertain ones, and nil otherwise.
      def extract_uncertainty(date)
        approximate = approximate?(date)
        uncertain = uncertain?(date)

        if approximate
          uncertain ? '%' : '~'
        elsif uncertain
          '?'
        end
      end

      # Returns the first four-digit sequence starting with 0, 1 or 2 as a
      # String, or nil.
      def extract_year(date)
        Regexp.last_match(1) if date =~ /\D?([012]\d\d\d)\D?/
      end

      # Returns the first stand-alone number in 0..31 as a zero-padded
      # two-digit String, or nil.
      def extract_day(date)
        format('%02d', Regexp.last_match(1).to_i) if date =~ /\b([012]?\d|3[01])\b/
      end

      # Returns the two-digit month for the first month-name pattern that
      # matches, or nil. Patterns are tried in calendar order.
      def extract_month_by_name(date)
        month_names = [
          /\bjan/i,
          /\bf(eb|év)/i,
          /\bmar/i,
          /\ba[pv]r/i,
          /\bma[yi]/i,
          /\bjui?n/i,
          /\bjui?l/i,
          /\ba(ug|oût)/i,
          /\bsep/i,
          /\bo[ck]t/i,
          /\bnov/i,
          /\bd[eé]c/i
        ]

        # Regexp#=== is the comparison a case/when statement performs.
        position = month_names.index { |pattern| pattern === date }
        position && format('%02d', position + 1)
      end

      private

      # Normalizes a single date value as described for #normalize.
      def normalize_value(value)
        return 'XXXX' if unknown?(value)
        # TODO AD/BC
        # TODO Seasons
        return value if interval?(value) || iso?(value)

        year = extract_year(value)
        return value if year.nil?

        month = extract_month_by_name(value)
        # A day is only looked for when a month name was found.
        day = month && extract_day(value)

        "#{[year, month, day].compact.join('-')}#{extract_uncertainty(value)}"
      end
    end
  end
end