anystyle 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/HISTORY.md +78 -0
- data/LICENSE +27 -0
- data/README.md +103 -0
- data/lib/anystyle.rb +71 -0
- data/lib/anystyle/dictionary.rb +132 -0
- data/lib/anystyle/dictionary/gdbm.rb +52 -0
- data/lib/anystyle/dictionary/lmdb.rb +67 -0
- data/lib/anystyle/dictionary/marshal.rb +27 -0
- data/lib/anystyle/dictionary/redis.rb +55 -0
- data/lib/anystyle/document.rb +264 -0
- data/lib/anystyle/errors.rb +14 -0
- data/lib/anystyle/feature.rb +27 -0
- data/lib/anystyle/feature/affix.rb +43 -0
- data/lib/anystyle/feature/brackets.rb +32 -0
- data/lib/anystyle/feature/canonical.rb +13 -0
- data/lib/anystyle/feature/caps.rb +20 -0
- data/lib/anystyle/feature/category.rb +70 -0
- data/lib/anystyle/feature/dictionary.rb +16 -0
- data/lib/anystyle/feature/indent.rb +16 -0
- data/lib/anystyle/feature/keyword.rb +52 -0
- data/lib/anystyle/feature/line.rb +39 -0
- data/lib/anystyle/feature/locator.rb +18 -0
- data/lib/anystyle/feature/number.rb +39 -0
- data/lib/anystyle/feature/position.rb +28 -0
- data/lib/anystyle/feature/punctuation.rb +22 -0
- data/lib/anystyle/feature/quotes.rb +20 -0
- data/lib/anystyle/feature/ref.rb +21 -0
- data/lib/anystyle/feature/terminal.rb +19 -0
- data/lib/anystyle/feature/words.rb +74 -0
- data/lib/anystyle/finder.rb +94 -0
- data/lib/anystyle/format/bibtex.rb +63 -0
- data/lib/anystyle/format/csl.rb +28 -0
- data/lib/anystyle/normalizer.rb +65 -0
- data/lib/anystyle/normalizer/brackets.rb +13 -0
- data/lib/anystyle/normalizer/container.rb +13 -0
- data/lib/anystyle/normalizer/date.rb +109 -0
- data/lib/anystyle/normalizer/edition.rb +16 -0
- data/lib/anystyle/normalizer/journal.rb +14 -0
- data/lib/anystyle/normalizer/locale.rb +30 -0
- data/lib/anystyle/normalizer/location.rb +24 -0
- data/lib/anystyle/normalizer/locator.rb +22 -0
- data/lib/anystyle/normalizer/names.rb +88 -0
- data/lib/anystyle/normalizer/page.rb +29 -0
- data/lib/anystyle/normalizer/publisher.rb +18 -0
- data/lib/anystyle/normalizer/pubmed.rb +18 -0
- data/lib/anystyle/normalizer/punctuation.rb +23 -0
- data/lib/anystyle/normalizer/quotes.rb +14 -0
- data/lib/anystyle/normalizer/type.rb +54 -0
- data/lib/anystyle/normalizer/volume.rb +26 -0
- data/lib/anystyle/parser.rb +199 -0
- data/lib/anystyle/support.rb +4 -0
- data/lib/anystyle/support/finder.mod +3234 -0
- data/lib/anystyle/support/finder.txt +75 -0
- data/lib/anystyle/support/parser.mod +15025 -0
- data/lib/anystyle/support/parser.txt +75 -0
- data/lib/anystyle/utils.rb +70 -0
- data/lib/anystyle/version.rb +3 -0
- data/res/finder/bb132pr2055.ttx +6803 -0
- data/res/finder/bb550sh8053.ttx +18660 -0
- data/res/finder/bb599nz4341.ttx +2957 -0
- data/res/finder/bb725rt6501.ttx +15276 -0
- data/res/finder/bc605xz1554.ttx +18815 -0
- data/res/finder/bd040gx5718.ttx +4271 -0
- data/res/finder/bd413nt2715.ttx +4956 -0
- data/res/finder/bd466fq0394.ttx +6100 -0
- data/res/finder/bf668vw2021.ttx +3578 -0
- data/res/finder/bg495cx0468.ttx +7267 -0
- data/res/finder/bg599vt3743.ttx +6752 -0
- data/res/finder/bg608dx2253.ttx +4094 -0
- data/res/finder/bh410qk3771.ttx +8785 -0
- data/res/finder/bh989ww6442.ttx +17204 -0
- data/res/finder/bj581pc8202.ttx +2719 -0
- data/res/parser/bad.xml +5199 -0
- data/res/parser/core.xml +7924 -0
- data/res/parser/gold.xml +2707 -0
- data/res/parser/good.xml +34281 -0
- data/res/parser/stanford-books.xml +2280 -0
- data/res/parser/stanford-diss.xml +726 -0
- data/res/parser/stanford-theses.xml +4684 -0
- data/res/parser/ugly.xml +33246 -0
- metadata +195 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Feature that inspects a token for surface hints commonly found in
    # bibliographic references.
    class Ref < Feature
      # Returns five observations for +token+, in this order:
      #
      # 1. bucketed count of year-like numbers (1000-2099),
      # 2. bucketed count of locator hints (digit/punctuation/digit runs,
      #    dash-separated digits, "p."/"pp.", "vol(s).", "no(s)."),
      # 3. bucketed count of an upper-case letter followed by a period,
      # 4. bucketed count of editor / "et al" markers,
      # 5. 'T' when the token starts with an enumeration label such as
      #    "[12] ", "(3) " or "4. ", otherwise 'F'.
      #
      # `count` is not defined in this class; presumably it returns the
      # number of matches of the pattern in the token -- TODO confirm.
      # +opts+ is accepted but not used here.
      #
      # NOTE(review): the trailing \b after `eds?\.` in the fourth pattern
      # only matches when a word character directly follows the period;
      # left untouched because the observations feed a trained model.
      def observe(token, **opts)
        bucketed = [
          /\b(1\d|20)\d\d\b/,
          /(\d[\(:;]\d)|(\d\s*\p{Pd}+\s*\d)|\bpp?\.|\bvols?\.|\bnos?\./i,
          /\b\p{Lu}\./,
          /\b(eds?\.|edited by|editors?|hg|hrsg|et al)\b/i
        ].map { |pattern| symbolize(count(token, pattern)) }

        enumerated = (token =~ /^\s*(\[\w+\]|\(\d+\)|\d+\.)\s+/) ? 'T' : 'F'

        bucketed << enumerated
      end

      # Maps a count +k+ onto a coarse bucket: '-' for less than one,
      # '+' for one or two, and '++' for three or more.
      def symbolize(k)
        if k < 1
          '-'
        elsif k < 3
          '+'
        else
          '++'
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Feature that grades how strongly the end of a token looks like the
    # end of a segment, based on its trailing punctuation.
    class Terminal < Feature
      # Returns :strong, :moderate, :weak or :none for +token+.
      #
      # The rules are tried in order and the first matching level wins:
      #
      # * :strong   - ends in a period or closing bracket (optionally
      #               followed by a quote/bracket), or contains a comma
      #               directly followed by a quote/bracket, or ends in a
      #               quote/bracket followed by a comma;
      # * :moderate - ends in a colon or quote, optionally followed by one
      #               punctuation character;
      # * :weak     - ends in !, ?, comma, semicolon or a dash, optionally
      #               followed by a quote.
      #
      # +opts+ is accepted but not used here.
      def observe(token, **opts)
        rules = [
          [:strong, [
            /[\.\)\]]["'”„’‚´«‘“`»」』\)\]]?$/,
            /,["'”„’‚´«‘“`»」』\)\]]|["'”„’‚´«‘“`»」』\)\]],$/
          ]],
          [:moderate, [/[:"'”„’‚´«‘“`»」』][,;:\p{Pd}!\?\.]?$/]],
          [:weak, [/[!\?,;\p{Pd}]["'”„’‚´«‘“`»」』]?$/]]
        ]

        # Regexp#=== mirrors the matching semantics of `case ... when`.
        level, = rules.find do |_, patterns|
          patterns.any? { |pattern| pattern === token }
        end

        level || :none
      end
    end
  end
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Feature describing the words of a token (a line of text): how many
    # there are, how long they are, and how they relate to a dictionary.
    class Words < Feature
      attr_reader :dictionary

      # Canonical words that typically appear in section headings.
      TITLE_WORDS = %w{
        abstract acknowledgements appendix bibliography bibliographie
        chapter cited contents figures introduction literatur literature
        references referenzen section tables works
      }

      # dictionary:: object answering +tag_counts(words)+.
      # Remaining options are passed on to the superclass constructor.
      def initialize(dictionary:, **opts)
        super(**opts)
        @dictionary = dictionary
      end

      # Returns the observations for +token+ as a flat array:
      #
      #   [word count, average word length, median word length,
      #    number of wide gaps, class of the first word, number of
      #    numeric runs, title-word ratio, *dictionary tag ratios]
      #
      # Average and median use integer arithmetic and are 0 when the
      # token has no words. `canonize` and `ratio` are defined outside
      # this class. +opts+ is accepted but not used here.
      def observe(token, **opts)
        words = token.scan(/\S+/).map { |word| canonize word }.reject(&:empty?)
        gaps = token.scan(/\S\s\s+\S/)
        digits = token.scan(/\d+(\.\d+)?/)
        title = words.count { |word| TITLE_WORDS.include?(word) }
        counts = dictionary.tag_counts(words)

        total = words.length
        lengths = words.map(&:length).sort

        if lengths.empty?
          avg = 0
          med = 0
        else
          avg = lengths.inject(:+) / lengths.length
          middle = lengths.length / 2
          med =
            if lengths.length.odd?
              lengths[middle]
            else
              # Mean of the two central lengths (integer division).
              (lengths[middle - 1] + lengths[middle]) / 2
            end
        end

        leading = [
          total,
          avg,
          med,
          gaps.length,
          classify(words[0]),
          digits.length,
          ratio(title, total)
        ]

        leading + counts.map { |cnt| ratio(cnt, total) }
      end

      # Classifies +word+ as
      #
      # * :none    - when it is nil,
      # * :number  - all digits, or one of a limited set of roman numerals,
      # * :numeric - contains at least one digit,
      # * :alpha   - anything else.
      def classify(word)
        case word
        when nil then :none
        when /^(\d+|[vx]?iii?|i?[vx]|)$/i then :number
        when /\d/ then :numeric
        else :alpha
        end
      end
    end
  end
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
module AnyStyle
  # Labels the lines of documents and extracts the result in one of the
  # supported formats (see +find+). Model handling, +options+, +features+
  # and +model+ come from ParserCore, which is defined elsewhere.
  class Finder < ParserCore
    @formats = [:hash, :references, :wapiti]

    @defaults = {
      model: File.join(SUPPORT, 'finder.mod'),
      pattern: File.join(SUPPORT, 'finder.txt'),
      compact: true,
      threads: 4,
      format: :references,
      # String#untaint was deprecated in Ruby 2.7 and removed in Ruby 3.2.
      # Only call it where it still exists so that this class can be
      # loaded on current Rubies as well as on old ones.
      training_data: Dir[File.join(RES, 'finder', '*.ttx')].map { |path|
        path.respond_to?(:untaint) ? path.untaint : path
      }
    }

    # options:: passed to ParserCore; +options[:dictionary]+, when given,
    #           is used by the Words feature instead of
    #           +Dictionary.instance+.
    def initialize(options = {})
      super(options)

      @features = [
        Feature::Line.new,
        Feature::Category.new(strip: true),
        Feature::Words.new(dictionary: options[:dictionary] || Dictionary.instance),
        Feature::Indent.new,
        Feature::Ref.new,
        Feature::Position.new(seq: :page, idx: :ln),
        Feature::Position.new(seq: :pages, idx: :pn)
      ]
    end

    # Computes the observations of every line in every document of
    # +dataset+ by asking each feature in turn and flattening the results.
    # Returns +dataset+.
    def expand(dataset)
      dataset.each do |doc|
        doc.each.with_index do |(line, ln, page, pn), idx|
          line.observations = features.map.with_index { |f, fn|
            f.observe line.value,
              page: page,
              pages: doc.pages,
              seq: doc,
              pn: pn,
              ln: ln,
              fn: fn,
              idx: idx
          }.flatten
        end
      end
    end

    # Labels +input+ and converts the result.
    #
    # format:: :references (or :ref), :hash or :wapiti; defaults to
    #          +options[:format]+.
    #
    # Raises ArgumentError for any other format.
    def find(input, format: options[:format], **opts)
      case format.to_sym
      when :references, :ref
        format_references(label(input, **opts), **opts)
      when :hash
        format_hash(label(input, **opts), **opts)
      when :wapiti
        label(input, **opts)
      else
        raise ArgumentError, "unknown format '#{format}'"
      end
    end

    # Returns the +to_h+ representation of every document in +dataset+.
    def format_hash(dataset, **opts)
      dataset.map { |doc| doc.to_h(**opts) }
    end

    # Returns the +references+ of every document in +dataset+.
    def format_references(dataset, **opts)
      dataset.map { |doc| doc.references(**opts) }
    end

    # Prepares +input+, runs the model over it and returns a new
    # Wapiti::Dataset of the documents labelled with the model output.
    def label(input, layout: true, **opts)
      dataset = prepare(input, layout: layout, **opts)
      output = model.label(dataset, **opts)
      Wapiti::Dataset.new(dataset.map.with_index { |doc, idx|
        doc.label(output[idx])
      })
    end

    # Turns +input+ into something the superclass can prepare:
    #
    # * a String is opened as a single document,
    # * an Array is opened element by element and wrapped in a dataset,
    # * anything else is passed through unchanged.
    #
    # +layout+ is forwarded to Document.open in both the String and the
    # Array case, so a caller's choice applies to every opened file.
    def prepare(input, layout: true, **opts)
      case input
      when String
        super(Document.open(input, layout: layout, **opts), **opts)
      when Array
        docs = input.map { |f| Document.open(f, layout: layout, **opts) }
        super(Wapiti::Dataset.new(docs), **opts)
      else
        super(input, **opts)
      end
    end

    # Writes every document of +dataset+ to its own file in +dir+.
    #
    # The file name is the document's base name without extension, or the
    # document's index when it has no path; the extension is 'ttx' when
    # +tagged+ is true and 'txt' otherwise.
    def save_each(dataset, dir: '.', tagged: false, **opts)
      dataset.each.with_index do |doc, idx|
        name = doc.path.nil? ? idx : File.basename(doc.path, File.extname(doc.path))
        file = "#{name}.#{tagged ? 'ttx' : 'txt'}"
        File.write(File.join(dir, file), doc.to_s(tagged: tagged, **opts))
      end
    end

  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module AnyStyle
  module Format
    # Mixin that converts a parsed dataset into a BibTeX bibliography.
    # Relies on +format_hash+, +flatten_values+ and +rename_value+ being
    # available on the including object.
    module BibTeX
      # Reference types whose BibTeX entry type has a different name.
      TYPES = {
        'article-journal' => 'article',
        'chapter' => 'incollection',
        'manuscript' => 'unpublished',
        'paper-conference' => 'inproceedings',
        'report' => 'techreport'
      }

      # Builds a ::BibTeX::Bibliography with one entry per item of
      # +dataset+. The 'bibtex' library is loaded on first use.
      # +opts+ is accepted but not used here.
      def format_bibtex(dataset, **opts)
        require 'bibtex'

        bibliography = ::BibTeX::Bibliography.new

        format_hash(dataset).each do |entry|
          flatten_values entry, skip: Normalizer::Names.keys

          # Replace :type with the BibTeX entry type ('misc' if unknown).
          original_type = entry[:type]
          entry[:bibtex_type] = TYPES[original_type] || original_type || 'misc'
          entry.delete :type

          # Type-specific field names take precedence over the generic
          # renames further down.
          case entry[:bibtex_type]
          when 'article'
            rename_value entry, :'container-title', :journal
            rename_value entry, :issue, :number
          when 'techreport'
            rename_value entry, :publisher, :institution
          when 'thesis'
            rename_value entry, :publisher, :school
          end

          Normalizer::Names.keys.each { |role| names_to_bibtex entry, role }

          [
            [:'collection-title', :series],
            [:'container-title', :booktitle],
            [:accessed, :urldate],
            [:genre, :type],
            [:location, :address]
          ].each { |from, to| rename_value entry, from, to }

          bibliography << ::BibTeX::Entry.new(entry)
        end

        bibliography
      end

      # Replaces the list of names stored under +role+ in +hash+ with a
      # single string in which the names are joined by ' and '.
      #
      # A name with a :literal key is used verbatim; a name with :family
      # or :given becomes "family, suffix, given" (missing parts left
      # out); any other name is dropped. Does nothing and returns nil
      # when +hash+ has no such role.
      def names_to_bibtex(hash, role)
        return unless hash.key?(role)

        formatted = hash[role].map do |name|
          if name.key?(:literal)
            name[:literal]
          elsif name.key?(:family) || name.key?(:given)
            name.values_at(:family, :suffix, :given).compact.join(', ')
          end
        end

        hash[role] = formatted.compact.join(' and ')
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module AnyStyle
  module Format
    # Mixin that converts a parsed dataset into CSL-style hashes.
    # Relies on +format_hash+, +flatten_values+ and +rename_value+ being
    # available on the including object.
    module CSL
      # Returns one hash per item of +dataset+ with fields renamed to
      # their CSL names and with names flagged as :others removed from
      # every name role. +opts+ is accepted but not used here.
      def format_csl(dataset, **opts)
        renames = {
          pages: :page,
          location: :'publisher-place',
          url: :URL,
          doi: :DOI,
          pmid: :PMID,
          pmcid: :PMCID
        }

        format_hash(dataset).map do |hash|
          flatten_values hash, skip: Normalizer::Names.keys

          renames.each { |from, to| rename_value hash, from, to }

          Normalizer::Names.keys.each do |role|
            hash[role].reject! { |name| name[:others] } if hash.key?(role)
          end

          hash
        end
      end

      alias_method :format_citeproc, :format_csl
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
module AnyStyle
  # Base class for normalizers: objects that clean up the values stored
  # under certain keys of an item (a hash mapping keys to arrays of
  # values). Subclasses implement +normalize+.
  class Normalizer
    # Keys handled by default. This is a class-level instance variable,
    # so each subclass that wants default keys has to set its own
    # (e.g. `@keys = [:date]`); it is not inherited.
    @keys = []

    class << self
      attr_reader :keys
    end

    # The keys this instance works on (see +keys_for+).
    attr_reader :keys

    # Flag that can be set to mark this normalizer as one to be skipped.
    attr_accessor :skip

    # keys:: the keys this instance should work on; defaults to the keys
    #        declared on the class.
    def initialize(keys: self.class.keys)
      @keys = keys
      @skip = false
    end

    # Returns the name of the normalizer's class.
    def name
      self.class.name
    end

    # Normalizes +item+. Must be implemented by subclasses; the base
    # implementation raises NotImplementedError.
    def normalize(item, **opts)
      raise NotImplementedError
    end

    # Adds +value+ to the list stored under +key+ in +item+, creating the
    # list if the key is not present yet.
    def append(item, key, value)
      if item.key?(key)
        item[key] << value
      else
        item[key] = [value]
      end
    end

    # Yields every (key, value) pair of +item+ for the given +keys+,
    # skipping keys the item does not have. Returns +item+.
    def each_value(item, keys = keys_for(item))
      keys.each do |key|
        item[key].each do |value|
          yield key, value
        end if item.key?(key)
      end
      item
    end

    # Replaces every value of +item+ stored under the given +keys+ with
    # the block's result for that (key, value) pair. Block results are
    # flattened, and nil or empty results are dropped. Returns +item+.
    def map_values(item, keys = keys_for(item))
      keys.each do |key|
        if item.key?(key)
          item[key] = item[key].map { |value|
            yield key, value
          }.flatten.reject { |v| v.nil? || v.empty? }
        end
      end
      item
    end

    # Returns the keys to process for +item+: the keys given to the
    # constructor, falling back to the keys declared on the class. When
    # neither provides any keys, all keys of +item+ are used.
    def keys_for(item)
      # Honour per-instance keys; previously only the class-level keys
      # were consulted, so the constructor's `keys:` had no effect.
      selected = keys || self.class.keys

      if selected.nil? || selected.empty?
        item.keys
      else
        selected
      end
    end

    # Whether this normalizer has been flagged to be skipped.
    def skip?
      @skip
    end
  end
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
module AnyStyle
|
2
|
+
class Normalizer
|
3
|
+
class Date < Normalizer
|
4
|
+
@keys = [:date]
|
5
|
+
|
6
|
+
def normalize(item, **opts)
|
7
|
+
map_values(item) do |_, value|
|
8
|
+
case
|
9
|
+
when unknown?(value)
|
10
|
+
'XXXX'
|
11
|
+
when interval?(value)
|
12
|
+
value
|
13
|
+
# TODO AD/BC
|
14
|
+
# TODO Seasons
|
15
|
+
when iso?(value)
|
16
|
+
value
|
17
|
+
else
|
18
|
+
year = extract_year(value)
|
19
|
+
unless year.nil?
|
20
|
+
month = extract_month_by_name(value)
|
21
|
+
day = extract_day(value) unless month.nil?
|
22
|
+
[
|
23
|
+
[year, month, day].compact.join('-'),
|
24
|
+
extract_uncertainty(value)
|
25
|
+
].compact.join('')
|
26
|
+
else
|
27
|
+
value
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def iso?(date)
|
34
|
+
date =~ /[012]\d\d\d-\d\d-\d\d/
|
35
|
+
end
|
36
|
+
|
37
|
+
def interval?(date)
|
38
|
+
date =~ /\/|\s\p{Pd}\s|(\s([12]?\d|30)\p{Pd}([12]?\d|3[01])?)/
|
39
|
+
end
|
40
|
+
|
41
|
+
def unknown?(date)
|
42
|
+
date =~ /inconnue|unknown|unbekannt|[ns]\. ?d\b|no date/i
|
43
|
+
end
|
44
|
+
|
45
|
+
def uncertain?(date)
|
46
|
+
date =~ /\?/
|
47
|
+
end
|
48
|
+
|
49
|
+
def approximate?(date)
|
50
|
+
date =~ /(\b(circa|ca\.|vers|approx))|(^[cv]\.)/i
|
51
|
+
end
|
52
|
+
|
53
|
+
def extract_uncertainty(date)
|
54
|
+
if approximate?(date)
|
55
|
+
uncertain?(date) ? '%' : '~'
|
56
|
+
else
|
57
|
+
uncertain?(date) ? '?' : nil
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
def extract_year(date)
|
62
|
+
if date =~ /\D?([012]\d\d\d)\D?/
|
63
|
+
$1
|
64
|
+
else
|
65
|
+
nil
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
def extract_day(date)
|
70
|
+
if date =~ /\b([012]?\d|3[01])\b/
|
71
|
+
'%02d' % $1.to_i
|
72
|
+
else
|
73
|
+
nil
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
def extract_month_by_name(date)
|
78
|
+
case date
|
79
|
+
when /\bjan/i
|
80
|
+
'01'
|
81
|
+
when /\bf(eb|év)/i
|
82
|
+
'02'
|
83
|
+
when /\bmar/i
|
84
|
+
'03'
|
85
|
+
when /\ba[pv]r/i
|
86
|
+
'04'
|
87
|
+
when /\bma[yi]/i
|
88
|
+
'05'
|
89
|
+
when /\bjui?n/i
|
90
|
+
'06'
|
91
|
+
when /\bjui?l/i
|
92
|
+
'07'
|
93
|
+
when /\ba(ug|oût)/i
|
94
|
+
'08'
|
95
|
+
when /\bsep/i
|
96
|
+
'09'
|
97
|
+
when /\bo[ck]t/i
|
98
|
+
'10'
|
99
|
+
when /\bnov/i
|
100
|
+
'11'
|
101
|
+
when /\bd[eé]c/i
|
102
|
+
'12'
|
103
|
+
else
|
104
|
+
nil
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|