anystyle 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/HISTORY.md +78 -0
- data/LICENSE +27 -0
- data/README.md +103 -0
- data/lib/anystyle.rb +71 -0
- data/lib/anystyle/dictionary.rb +132 -0
- data/lib/anystyle/dictionary/gdbm.rb +52 -0
- data/lib/anystyle/dictionary/lmdb.rb +67 -0
- data/lib/anystyle/dictionary/marshal.rb +27 -0
- data/lib/anystyle/dictionary/redis.rb +55 -0
- data/lib/anystyle/document.rb +264 -0
- data/lib/anystyle/errors.rb +14 -0
- data/lib/anystyle/feature.rb +27 -0
- data/lib/anystyle/feature/affix.rb +43 -0
- data/lib/anystyle/feature/brackets.rb +32 -0
- data/lib/anystyle/feature/canonical.rb +13 -0
- data/lib/anystyle/feature/caps.rb +20 -0
- data/lib/anystyle/feature/category.rb +70 -0
- data/lib/anystyle/feature/dictionary.rb +16 -0
- data/lib/anystyle/feature/indent.rb +16 -0
- data/lib/anystyle/feature/keyword.rb +52 -0
- data/lib/anystyle/feature/line.rb +39 -0
- data/lib/anystyle/feature/locator.rb +18 -0
- data/lib/anystyle/feature/number.rb +39 -0
- data/lib/anystyle/feature/position.rb +28 -0
- data/lib/anystyle/feature/punctuation.rb +22 -0
- data/lib/anystyle/feature/quotes.rb +20 -0
- data/lib/anystyle/feature/ref.rb +21 -0
- data/lib/anystyle/feature/terminal.rb +19 -0
- data/lib/anystyle/feature/words.rb +74 -0
- data/lib/anystyle/finder.rb +94 -0
- data/lib/anystyle/format/bibtex.rb +63 -0
- data/lib/anystyle/format/csl.rb +28 -0
- data/lib/anystyle/normalizer.rb +65 -0
- data/lib/anystyle/normalizer/brackets.rb +13 -0
- data/lib/anystyle/normalizer/container.rb +13 -0
- data/lib/anystyle/normalizer/date.rb +109 -0
- data/lib/anystyle/normalizer/edition.rb +16 -0
- data/lib/anystyle/normalizer/journal.rb +14 -0
- data/lib/anystyle/normalizer/locale.rb +30 -0
- data/lib/anystyle/normalizer/location.rb +24 -0
- data/lib/anystyle/normalizer/locator.rb +22 -0
- data/lib/anystyle/normalizer/names.rb +88 -0
- data/lib/anystyle/normalizer/page.rb +29 -0
- data/lib/anystyle/normalizer/publisher.rb +18 -0
- data/lib/anystyle/normalizer/pubmed.rb +18 -0
- data/lib/anystyle/normalizer/punctuation.rb +23 -0
- data/lib/anystyle/normalizer/quotes.rb +14 -0
- data/lib/anystyle/normalizer/type.rb +54 -0
- data/lib/anystyle/normalizer/volume.rb +26 -0
- data/lib/anystyle/parser.rb +199 -0
- data/lib/anystyle/support.rb +4 -0
- data/lib/anystyle/support/finder.mod +3234 -0
- data/lib/anystyle/support/finder.txt +75 -0
- data/lib/anystyle/support/parser.mod +15025 -0
- data/lib/anystyle/support/parser.txt +75 -0
- data/lib/anystyle/utils.rb +70 -0
- data/lib/anystyle/version.rb +3 -0
- data/res/finder/bb132pr2055.ttx +6803 -0
- data/res/finder/bb550sh8053.ttx +18660 -0
- data/res/finder/bb599nz4341.ttx +2957 -0
- data/res/finder/bb725rt6501.ttx +15276 -0
- data/res/finder/bc605xz1554.ttx +18815 -0
- data/res/finder/bd040gx5718.ttx +4271 -0
- data/res/finder/bd413nt2715.ttx +4956 -0
- data/res/finder/bd466fq0394.ttx +6100 -0
- data/res/finder/bf668vw2021.ttx +3578 -0
- data/res/finder/bg495cx0468.ttx +7267 -0
- data/res/finder/bg599vt3743.ttx +6752 -0
- data/res/finder/bg608dx2253.ttx +4094 -0
- data/res/finder/bh410qk3771.ttx +8785 -0
- data/res/finder/bh989ww6442.ttx +17204 -0
- data/res/finder/bj581pc8202.ttx +2719 -0
- data/res/parser/bad.xml +5199 -0
- data/res/parser/core.xml +7924 -0
- data/res/parser/gold.xml +2707 -0
- data/res/parser/good.xml +34281 -0
- data/res/parser/stanford-books.xml +2280 -0
- data/res/parser/stanford-diss.xml +726 -0
- data/res/parser/stanford-theses.xml +4684 -0
- data/res/parser/ugly.xml +33246 -0
- metadata +195 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
module AnyStyle
|
2
|
+
class Feature
|
3
|
+
class Ref < Feature
|
4
|
+
def observe(token, **opts)
|
5
|
+
[
|
6
|
+
symbolize(count(token, /\b(1\d|20)\d\d\b/)),
|
7
|
+
symbolize(count(token, /(\d[\(:;]\d)|(\d\s*\p{Pd}+\s*\d)|\bpp?\.|\bvols?\.|\bnos?\./i)),
|
8
|
+
symbolize(count(token, /\b\p{Lu}\./)),
|
9
|
+
symbolize(count(token, /\b(eds?\.|edited by|editors?|hg|hrsg|et al)\b/i)),
|
10
|
+
token =~ /^\s*(\[\w+\]|\(\d+\)|\d+\.)\s+/ ? 'T' : 'F'
|
11
|
+
]
|
12
|
+
end
|
13
|
+
|
14
|
+
def symbolize(k)
|
15
|
+
return '-' if k < 1
|
16
|
+
return '+' if k < 3
|
17
|
+
return '++'
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Feature rating how strongly the punctuation found in a piece of text
    # suggests a terminal position.
    class Terminal < Feature
      # Ordered rules: the first label whose patterns match wins.
      STRENGTHS = [
        # Period or closing bracket at the end (optionally followed by a
        # quote or bracket); or a comma next to a quote or bracket.
        [:strong, [
          /[\.\)\]]["'”„’‚´«‘“`»」』\)\]]?$/,
          /,["'”„’‚´«‘“`»」』\)\]]|["'”„’‚´«‘“`»」』\)\]],$/
        ]],
        # Colon or quote at the end, optionally followed by one more
        # punctuation character.
        [:moderate, [
          /[:"'”„’‚´«‘“`»」』][,;:\p{Pd}!\?\.]?$/
        ]],
        # Other punctuation at the end, optionally followed by a quote.
        [:weak, [
          /[!\?,;\p{Pd}]["'”„’‚´«‘“`»」』]?$/
        ]]
      ].freeze

      # Returns :strong, :moderate, :weak or :none for +token+, depending
      # on which of the punctuation rules above matches first.
      def observe(token, **opts)
        label, = STRENGTHS.find do |_, patterns|
          patterns.any? { |pattern| pattern === token }
        end

        label || :none
      end
    end
  end
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module AnyStyle
  class Feature
    # Feature collecting word-level statistics for a piece of text: word
    # count, word lengths, wide gaps, numbers, heading vocabulary and
    # dictionary tag ratios.
    class Words < Feature
      attr_reader :dictionary

      # Words that typically occur in section headings (English, French
      # and German spellings).
      TITLE_WORDS = %w[
        abstract
        acknowledgements
        appendix
        bibliography
        bibliographie
        chapter
        cited
        contents
        figures
        introduction
        literatur
        literature
        references
        referenzen
        section
        tables
        works
      ]

      # dictionary - object answering #tag_counts(words); all remaining
      #              options are handed on to the superclass.
      def initialize(dictionary:, **opts)
        super(**opts)
        @dictionary = dictionary
      end

      # Builds the observation for +token+.
      #
      # Returns an Array consisting of: number of words, average word
      # length, median word length (both using integer division), number
      # of gaps of two or more whitespace characters between non-space
      # characters, the class of the first word (see #classify), number of
      # numbers, the ratio of heading words, followed by one ratio per
      # entry returned by the dictionary's #tag_counts.
      #
      # The #canonize and #ratio helpers are defined outside of this class.
      def observe(token, **opts)
        words = token.scan(/\S+/).each_with_object([]) do |chunk, list|
          word = canonize(chunk)
          list << word unless word.empty?
        end

        total    = words.length
        gaps     = token.scan(/\S\s\s+\S/).length
        digits   = token.scan(/\d+(\.\d+)?/).length
        headings = words.count { |word| TITLE_WORDS.include?(word) }
        tags     = dictionary.tag_counts(words)

        avg = med = 0

        unless words.empty?
          sizes = words.map(&:length).sort
          mid = sizes.length / 2

          avg = sizes.reduce(0, :+) / sizes.length
          # Odd: the middle element; even: mean of the two middle elements.
          med = sizes.length.odd? ? sizes[mid] : (sizes[mid - 1] + sizes[mid]) / 2
        end

        fixed = [
          total,
          avg,
          med,
          gaps,
          classify(words.first),
          digits,
          ratio(headings, total)
        ]

        fixed + tags.map { |tag| ratio(tag, total) }
      end

      # Classifies a single word:
      #
      # :none    - no word at all (nil)
      # :number  - only digits, or one of the short roman numerals
      #            accepted by the pattern below (also the empty string)
      # :numeric - any other word containing a digit
      # :alpha   - everything else
      def classify(word)
        return :none if word.nil?
        return :number if /^(\d+|[vx]?iii?|i?[vx]|)$/i === word
        return :numeric if /\d/ === word

        :alpha
      end
    end
  end
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
module AnyStyle
  # Labels the lines of whole documents using line-level features and
  # converts the labelled documents into one of the supported formats.
  #
  # NOTE(review): #options, #features and #model are not defined here;
  # presumably they are provided by ParserCore -- confirm.
  class Finder < ParserCore
    @formats = [:hash, :references, :wapiti]

    @defaults = {
      model: File.join(SUPPORT, 'finder.mod'),
      pattern: File.join(SUPPORT, 'finder.txt'),
      compact: true,
      threads: 4,
      format: :references,
      # String#untaint no longer exists on Ruby >= 3.2; calling it
      # unconditionally made loading this class fail there. Only call it
      # on Rubies that still provide it.
      training_data: Dir[File.join(RES, 'finder', '*.ttx')].map { |path|
        path.respond_to?(:untaint) ? path.untaint : path
      }
    }

    # options - Hash of options handed on to ParserCore. The optional
    #           :dictionary entry is used for the Words feature; without
    #           it Dictionary.instance is used.
    def initialize(options = {})
      super(options)

      @features = [
        Feature::Line.new,
        Feature::Category.new(strip: true),
        Feature::Words.new(dictionary: options[:dictionary] || Dictionary.instance),
        Feature::Indent.new,
        Feature::Ref.new,
        Feature::Position.new(seq: :page, idx: :ln),
        Feature::Position.new(seq: :pages, idx: :pn)
      ]
    end

    # Computes the observations of every line of every document in
    # +dataset+ by asking each feature in turn; the per-feature results
    # are flattened into a single list per line.
    def expand(dataset)
      dataset.each do |doc|
        doc.each.with_index do |(line, ln, page, pn), idx|
          line.observations = features.map.with_index { |f, fn|
            f.observe line.value,
              page: page,
              pages: doc.pages,
              seq: doc,
              pn: pn,
              ln: ln,
              fn: fn,
              idx: idx
          }.flatten
        end
      end
    end

    # Labels +input+ and returns the result in the requested +format+:
    #
    # :references (or :ref) - the result of Document#references per document
    # :hash                 - the result of Document#to_h per document
    # :wapiti               - the labelled dataset itself
    #
    # Raises ArgumentError for any other format.
    def find(input, format: options[:format], **opts)
      case format.to_sym
      when :references, :ref
        format_references(label(input, **opts), **opts)
      when :hash
        format_hash(label(input, **opts), **opts)
      when :wapiti
        label(input, **opts)
      else
        raise ArgumentError, "unknown format '#{format}'"
      end
    end

    # Converts every document of +dataset+ using Document#to_h.
    def format_hash(dataset, **opts)
      dataset.map { |doc| doc.to_h(**opts) }
    end

    # Converts every document of +dataset+ using Document#references.
    def format_references(dataset, **opts)
      dataset.map { |doc| doc.references(**opts) }
    end

    # Prepares +input+, runs the model on it and returns a new
    # Wapiti::Dataset in which every document has been labelled with the
    # model output at the same index.
    def label(input, layout: true, **opts)
      dataset = prepare(input, layout: layout, **opts)
      output = model.label(dataset, **opts)

      labelled = dataset.map.with_index { |doc, idx| doc.label(output[idx]) }
      Wapiti::Dataset.new(labelled)
    end

    # Turns +input+ into something the superclass can prepare:
    #
    # String - opened as a single document
    # Array  - every element is opened as a document and the documents
    #          are wrapped in a Wapiti::Dataset
    # other  - handed on unchanged
    #
    # The +layout+ option is passed to Document.open for both Strings and
    # Arrays (it used to be dropped silently for Arrays).
    def prepare(input, layout: true, **opts)
      case input
      when String
        super(Document.open(input, layout: layout, **opts), **opts)
      when Array
        documents = input.map { |file| Document.open(file, layout: layout, **opts) }
        super(Wapiti::Dataset.new(documents), **opts)
      else
        super(input, **opts)
      end
    end

    # Writes every document of +dataset+ to a file in +dir+.
    #
    # The file is named after the document's path (without directory and
    # extension) or, if the document has no path, after its index in the
    # dataset. The extension is 'ttx' for tagged output and 'txt'
    # otherwise.
    def save_each(dataset, dir: '.', tagged: false, **opts)
      extension = tagged ? 'ttx' : 'txt'

      dataset.each.with_index do |doc, idx|
        name = doc.path.nil? ? idx : File.basename(doc.path, File.extname(doc.path))
        target = File.join(dir, "#{name}.#{extension}")
        File.write(target, doc.to_s(tagged: tagged, **opts))
      end
    end
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module AnyStyle
  module Format
    # Mixin converting parsed references into a BibTeX bibliography.
    #
    # Relies on #format_hash, #flatten_values and #rename_value being
    # available on the including object; they are defined elsewhere.
    module BibTeX
      # Reference types whose BibTeX entry type has a different name.
      TYPES = {
        'article-journal' => 'article',
        'chapter' => 'incollection',
        'manuscript' => 'unpublished',
        'paper-conference' => 'inproceedings',
        'report' => 'techreport'
      }

      # Converts +dataset+ into a ::BibTeX::Bibliography containing one
      # entry per reference. The bibtex library is loaded on first use.
      def format_bibtex(dataset, **opts)
        require 'bibtex'

        format_hash(dataset).each_with_object(::BibTeX::Bibliography.new) do |entry, bibliography|
          flatten_values entry, skip: Normalizer::Names.keys

          # Translate the type; untyped references become 'misc'.
          kind = entry[:type]
          entry[:bibtex_type] = TYPES[kind] || kind || 'misc'
          entry.delete :type

          # Fields whose name depends on the entry type.
          case entry[:bibtex_type]
          when 'article'
            rename_value entry, :'container-title', :journal
            rename_value entry, :issue, :number
          when 'techreport'
            rename_value entry, :publisher, :institution
          when 'thesis'
            rename_value entry, :publisher, :school
          end

          Normalizer::Names.keys.each { |role| names_to_bibtex entry, role }

          # Fields renamed for every entry type.
          [
            [:'collection-title', :series],
            [:'container-title', :booktitle],
            [:accessed, :urldate],
            [:genre, :type],
            [:location, :address]
          ].each do |from, to|
            rename_value entry, from, to
          end

          bibliography << ::BibTeX::Entry.new(entry)
        end
      end

      # Replaces the list of names stored in +hash+ under +role+ with a
      # single String in which the names are joined by ' and '.
      #
      # Literal names are used as they are; other names are rendered as
      # "family, suffix, given" (skipping missing parts). Names having
      # neither a literal, a family nor a given part are dropped.
      #
      # Returns the new String, or nil if +hash+ has no such role.
      def names_to_bibtex(hash, role)
        return unless hash.key?(role)

        rendered = hash[role].map do |name|
          if name.key?(:literal)
            name[:literal]
          elsif name.key?(:family) || name.key?(:given)
            name.values_at(:family, :suffix, :given).compact.join(', ')
          end
        end

        hash[role] = rendered.compact.join(' and ')
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module AnyStyle
  module Format
    # Mixin converting parsed references into CSL-style hashes.
    #
    # Relies on #format_hash, #flatten_values and #rename_value being
    # available on the including object; they are defined elsewhere.
    module CSL
      # Converts +dataset+ into an Array of Hashes: values are flattened
      # (except for name lists), a few keys are renamed and name entries
      # flagged as :others are removed from every name list.
      def format_csl(dataset, **opts)
        renames = [
          [:pages, :page],
          [:location, :'publisher-place'],
          [:url, :URL],
          [:doi, :DOI],
          [:pmid, :PMID],
          [:pmcid, :PMCID]
        ]

        format_hash(dataset).map do |hash|
          flatten_values hash, skip: Normalizer::Names.keys

          renames.each { |from, to| rename_value hash, from, to }

          Normalizer::Names.keys.each do |role|
            next unless hash.key?(role)

            hash[role].delete_if { |name| name[:others] }
          end

          hash
        end
      end

      alias_method :format_citeproc, :format_csl
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
module AnyStyle
  # Base class of all normalizers. A normalizer post-processes an item (a
  # Hash whose values are Arrays) and usually restricts itself to a set
  # of keys.
  #
  # Subclasses declare their default keys by setting the class-level
  # instance variable +@keys+ and implement #normalize.
  class Normalizer
    # Default keys of the base class; an empty list means "all keys of
    # the item" (see #keys_for).
    @keys = []

    class << self
      # The default keys declared by this class.
      attr_reader :keys
    end

    # The keys this instance operates on.
    attr_reader :keys

    # Flag read through #skip?; it is only stored here.
    attr_accessor :skip

    # keys - the keys to operate on; defaults to the keys declared by the
    #        instance's class.
    def initialize(keys: self.class.keys)
      @keys = keys
      @skip = false
    end

    # Returns the name of the normalizer's class.
    def name
      self.class.name
    end

    # Normalizes +item+. Must be implemented by subclasses.
    #
    # Raises NotImplementedError.
    def normalize(item, **opts)
      raise NotImplementedError
    end

    # Adds +value+ to the list stored in +item+ under +key+, creating the
    # list if the key does not exist yet.
    def append(item, key, value)
      if item.key?(key)
        item[key] << value
      else
        item[key] = [value]
      end
    end

    # Yields key and value for every value stored in +item+ under one of
    # the given +keys+ (by default those returned by #keys_for). Keys
    # missing from the item are skipped.
    #
    # Returns the item.
    def each_value(item, keys = keys_for(item))
      keys.each do |key|
        next unless item.key?(key)

        item[key].each { |value| yield key, value }
      end

      item
    end

    # Replaces every value stored in +item+ under one of the given +keys+
    # (by default those returned by #keys_for) with the result of the
    # block. Results are flattened; nil and empty results are dropped.
    #
    # Returns the item.
    def map_values(item, keys = keys_for(item))
      keys.each do |key|
        next unless item.key?(key)

        mapped = item[key].map { |value| yield key, value }
        item[key] = mapped.flatten.reject { |v| v.nil? || v.empty? }
      end

      item
    end

    # Returns the keys to process for +item+.
    #
    # The keys given to the constructor take precedence; previously they
    # were ignored in favour of the class-level keys. If the instance has
    # no keys at all (e.g. a subclass whose #initialize does not call
    # super) the class-level keys are used. When the resulting list is
    # missing or empty, all keys of the item are returned; a missing list
    # used to raise NoMethodError.
    def keys_for(item)
      selected = keys || self.class.keys

      if selected.nil? || selected.empty?
        item.keys
      else
        selected
      end
    end

    # Returns the value of the skip flag.
    def skip?
      @skip
    end
  end
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
module AnyStyle
  class Normalizer
    # Normalizer rewriting the values stored under :date into
    # "YYYY[-MM[-DD]]" strings, optionally followed by an uncertainty
    # marker.
    class Date < Normalizer
      @keys = [:date]

      # Month name prefixes (with French and German variants where the
      # patterns cover them) and the corresponding month number. The
      # first matching entry wins.
      MONTH_PATTERNS = [
        [/\bjan/i, '01'],
        [/\bf(eb|év)/i, '02'],
        [/\bmar/i, '03'],
        [/\ba[pv]r/i, '04'],
        [/\bma[yi]/i, '05'],
        [/\bjui?n/i, '06'],
        [/\bjui?l/i, '07'],
        [/\ba(ug|oût)/i, '08'],
        [/\bsep/i, '09'],
        [/\bo[ck]t/i, '10'],
        [/\bnov/i, '11'],
        [/\bd[eé]c/i, '12']
      ].freeze

      # Rewrites every date value of +item+:
      #
      # * unknown dates become 'XXXX',
      # * intervals and values containing an ISO date are kept as they are,
      # * for everything else a year is extracted; if there is one, the
      #   month (by name) and, only if a month was found, the day are
      #   added, followed by the uncertainty marker, if any,
      # * values without a year are kept as they are.
      def normalize(item, **opts)
        map_values(item) do |_, value|
          if unknown?(value)
            'XXXX'
          elsif interval?(value) || iso?(value)
            # TODO AD/BC
            # TODO Seasons
            value
          else
            year = extract_year(value)

            if year.nil?
              value
            else
              month = extract_month_by_name(value)
              day = month.nil? ? nil : extract_day(value)
              stamp = [year, month, day].compact.join('-')

              [stamp, extract_uncertainty(value)].compact.join('')
            end
          end
        end
      end

      # Truthy if +date+ contains something shaped like YYYY-MM-DD.
      def iso?(date)
        date =~ /[012]\d\d\d-\d\d-\d\d/
      end

      # Truthy if +date+ contains a slash, a dash surrounded by
      # whitespace or a one or two digit number followed by a dash.
      def interval?(date)
        date =~ /\/|\s\p{Pd}\s|(\s([12]?\d|30)\p{Pd}([12]?\d|3[01])?)/
      end

      # Truthy if +date+ says that the date is not known
      # (e.g. "unknown", "n.d." or "no date").
      def unknown?(date)
        date =~ /inconnue|unknown|unbekannt|[ns]\. ?d\b|no date/i
      end

      # Truthy if +date+ contains a question mark.
      def uncertain?(date)
        date =~ /\?/
      end

      # Truthy if +date+ contains an approximation marker
      # (e.g. "circa", "ca." or a leading "c.").
      def approximate?(date)
        date =~ /(\b(circa|ca\.|vers|approx))|(^[cv]\.)/i
      end

      # Returns the marker to append to a normalized date:
      # '%' (approximate and uncertain), '~' (approximate),
      # '?' (uncertain) or nil.
      def extract_uncertainty(date)
        approximate = approximate?(date)
        uncertain = uncertain?(date)

        if uncertain
          approximate ? '%' : '?'
        elsif approximate
          '~'
        end
      end

      # Returns the first four digit sequence starting with 0, 1 or 2
      # found in +date+, or nil.
      def extract_year(date)
        Regexp.last_match(1) if date =~ /\D?([012]\d\d\d)\D?/
      end

      # Returns the first stand-alone one or two digit number up to 31
      # found in +date+ as a zero-padded String, or nil.
      def extract_day(date)
        format('%02d', Regexp.last_match(1).to_i) if date =~ /\b([012]?\d|3[01])\b/
      end

      # Returns the two digit number of the first month whose name
      # pattern matches +date+, or nil.
      def extract_month_by_name(date)
        _, number = MONTH_PATTERNS.find { |pattern, _| pattern === date }
        number
      end
    end
  end
end
|