anystyle 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/HISTORY.md +78 -0
- data/LICENSE +27 -0
- data/README.md +103 -0
- data/lib/anystyle.rb +71 -0
- data/lib/anystyle/dictionary.rb +132 -0
- data/lib/anystyle/dictionary/gdbm.rb +52 -0
- data/lib/anystyle/dictionary/lmdb.rb +67 -0
- data/lib/anystyle/dictionary/marshal.rb +27 -0
- data/lib/anystyle/dictionary/redis.rb +55 -0
- data/lib/anystyle/document.rb +264 -0
- data/lib/anystyle/errors.rb +14 -0
- data/lib/anystyle/feature.rb +27 -0
- data/lib/anystyle/feature/affix.rb +43 -0
- data/lib/anystyle/feature/brackets.rb +32 -0
- data/lib/anystyle/feature/canonical.rb +13 -0
- data/lib/anystyle/feature/caps.rb +20 -0
- data/lib/anystyle/feature/category.rb +70 -0
- data/lib/anystyle/feature/dictionary.rb +16 -0
- data/lib/anystyle/feature/indent.rb +16 -0
- data/lib/anystyle/feature/keyword.rb +52 -0
- data/lib/anystyle/feature/line.rb +39 -0
- data/lib/anystyle/feature/locator.rb +18 -0
- data/lib/anystyle/feature/number.rb +39 -0
- data/lib/anystyle/feature/position.rb +28 -0
- data/lib/anystyle/feature/punctuation.rb +22 -0
- data/lib/anystyle/feature/quotes.rb +20 -0
- data/lib/anystyle/feature/ref.rb +21 -0
- data/lib/anystyle/feature/terminal.rb +19 -0
- data/lib/anystyle/feature/words.rb +74 -0
- data/lib/anystyle/finder.rb +94 -0
- data/lib/anystyle/format/bibtex.rb +63 -0
- data/lib/anystyle/format/csl.rb +28 -0
- data/lib/anystyle/normalizer.rb +65 -0
- data/lib/anystyle/normalizer/brackets.rb +13 -0
- data/lib/anystyle/normalizer/container.rb +13 -0
- data/lib/anystyle/normalizer/date.rb +109 -0
- data/lib/anystyle/normalizer/edition.rb +16 -0
- data/lib/anystyle/normalizer/journal.rb +14 -0
- data/lib/anystyle/normalizer/locale.rb +30 -0
- data/lib/anystyle/normalizer/location.rb +24 -0
- data/lib/anystyle/normalizer/locator.rb +22 -0
- data/lib/anystyle/normalizer/names.rb +88 -0
- data/lib/anystyle/normalizer/page.rb +29 -0
- data/lib/anystyle/normalizer/publisher.rb +18 -0
- data/lib/anystyle/normalizer/pubmed.rb +18 -0
- data/lib/anystyle/normalizer/punctuation.rb +23 -0
- data/lib/anystyle/normalizer/quotes.rb +14 -0
- data/lib/anystyle/normalizer/type.rb +54 -0
- data/lib/anystyle/normalizer/volume.rb +26 -0
- data/lib/anystyle/parser.rb +199 -0
- data/lib/anystyle/support.rb +4 -0
- data/lib/anystyle/support/finder.mod +3234 -0
- data/lib/anystyle/support/finder.txt +75 -0
- data/lib/anystyle/support/parser.mod +15025 -0
- data/lib/anystyle/support/parser.txt +75 -0
- data/lib/anystyle/utils.rb +70 -0
- data/lib/anystyle/version.rb +3 -0
- data/res/finder/bb132pr2055.ttx +6803 -0
- data/res/finder/bb550sh8053.ttx +18660 -0
- data/res/finder/bb599nz4341.ttx +2957 -0
- data/res/finder/bb725rt6501.ttx +15276 -0
- data/res/finder/bc605xz1554.ttx +18815 -0
- data/res/finder/bd040gx5718.ttx +4271 -0
- data/res/finder/bd413nt2715.ttx +4956 -0
- data/res/finder/bd466fq0394.ttx +6100 -0
- data/res/finder/bf668vw2021.ttx +3578 -0
- data/res/finder/bg495cx0468.ttx +7267 -0
- data/res/finder/bg599vt3743.ttx +6752 -0
- data/res/finder/bg608dx2253.ttx +4094 -0
- data/res/finder/bh410qk3771.ttx +8785 -0
- data/res/finder/bh989ww6442.ttx +17204 -0
- data/res/finder/bj581pc8202.ttx +2719 -0
- data/res/parser/bad.xml +5199 -0
- data/res/parser/core.xml +7924 -0
- data/res/parser/gold.xml +2707 -0
- data/res/parser/good.xml +34281 -0
- data/res/parser/stanford-books.xml +2280 -0
- data/res/parser/stanford-diss.xml +726 -0
- data/res/parser/stanford-theses.xml +4684 -0
- data/res/parser/ugly.xml +33246 -0
- metadata +195 -0
@@ -0,0 +1,16 @@
|
|
1
|
+
module AnyStyle
  class Normalizer
    # Cleans up :edition values: expands the "rev." abbreviation and removes
    # a trailing edition designator ("ed.", "edition", "éd.", "Ausg.",
    # "Ausgabe").
    class Edition < Normalizer
      @keys = [:edition]

      # Rewrites every :edition value of +item+ through the inherited
      # +map_values+ helper (defined outside this file; presumably it
      # replaces each value with the block result -- confirm there).
      #
      # @param item [Hash] the reference being normalized
      # @return whatever +map_values+ returns
      def normalize(item, **opts)
        map_values(item) do |_, value|
          value
            # \b keeps "rev." from matching inside longer words ("prev.").
            .gsub(/\brev\./, 'revised')
            # The look-behind requires the designator to start a word (or to
            # follow a digit, as in "2ed."). Without it the optional suffix
            # turned "Revised" into "Revis" and "rev." into "revis".
            .gsub(/(?<!\p{L})([eé]d(\.|ition)?|ausg(\.|abe)?)$/i, '')
            .strip
        end
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module AnyStyle
  class Normalizer
    # Folds a :journal field into :'container-title' and marks the item as a
    # journal article.
    class Journal < Normalizer
      # Items without a :journal key are returned untouched. Otherwise the
      # type is set to 'article-journal', every journal value is handed to
      # the inherited +append+ helper under :'container-title', and the
      # :journal key is removed.
      #
      # @param item [Hash] the reference being normalized
      # @return [Hash] the same +item+
      def normalize(item, **opts)
        return item unless item.key?(:journal)

        item[:type] = 'article-journal'

        item[:journal].each do |name|
          append(item, :'container-title', name)
        end
        item.delete(:journal)

        item
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module AnyStyle
  # Optional dependency; +maybe_require+ is defined elsewhere in the gem.
  maybe_require 'language_detector'

  class Normalizer
    # Adds a :language entry by running a language detector over a sample of
    # the item's text fields. Does nothing when LanguageDetector is not
    # loaded.
    class Locale < Normalizer
      def initialize
        # @ld stays nil when the optional LanguageDetector constant is absent.
        @ld = LanguageDetector.new if defined?(LanguageDetector)
      end

      # @param item [Hash] the reference being normalized
      # @return [Hash] the same +item+, with :language set when a detector is
      #   available, no language is present yet and the sample is non-empty
      def normalize(item, **opts)
        return item if @ld.nil? || item.key?(:language)

        # Fields sampled for detection; :'collection-title' and :note are
        # intentionally left out (they were commented out in the original).
        fields = [:title, :'container-title', :location, :journal, :publisher]
        text = item.values_at(*fields).flatten.compact.join(' ')

        unless text.empty?
          item[:language] = @ld.detect(text)
        end

        item
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module AnyStyle
  class Normalizer
    # Trims :location values and, when no publisher is present yet, splits a
    # "Place: Publisher" value into its two parts.
    class Location < Normalizer
      @keys = [:location]

      # Rewrites every :location value through the inherited +map_values+
      # helper (defined outside this file).
      #
      # @param item [Hash] the reference being normalized
      # @return whatever +map_values+ returns
      def normalize(item, **opts)
        map_values(item) do |_, value|
          location = strip value

          if !item.key?(:publisher) && location.include?(':')
            # Limit 2: anything after the first colon belongs to the
            # publisher; previously text after a second colon was dropped.
            location, publisher = location.split(/\s*:\s*/, 2)

            # Field values are Arrays elsewhere in this file (Journal calls
            # item[:journal].each, Publisher reads item[:author][0]), so
            # store the publisher the same way, not as a bare String.
            item[:publisher] = [publisher] unless publisher.nil? || publisher.empty?
          end

          location
        end
      end

      # Removes leading and trailing runs of non-alphanumeric characters.
      #
      # @param string [String]
      # @return [String] a new, trimmed string
      def strip(string)
        string.gsub(/^\p{^Alnum}+|\p{^Alnum}+$/, '')
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module AnyStyle
  require 'uri'

  class Normalizer
    # Extracts the machine-readable part of :isbn and :url values.
    class Locator < Normalizer
      @keys = [:isbn, :url]

      # Rewrites :isbn and :url values through the inherited +map_values+
      # helper (defined outside this file).
      #
      # @param item [Hash] the reference being normalized
      # @return whatever +map_values+ returns
      def normalize(item, **opts)
        map_values(item) do |key, value|
          case key
          when :isbn
            # First run of digits and hyphens that starts with a digit, plus
            # an optional final "X": the ISBN-10 check digit may be "X",
            # which the previous pattern /[\d-]+/ cut off. Yields nil when
            # the value contains no digit.
            value[/\d[\d-]*[\dXx]?/]
          when :url
            # Returns an Array of every URI found in the value.
            URI.extract(value)
          else
            value
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
module AnyStyle
|
2
|
+
require 'namae'
|
3
|
+
|
4
|
+
class Normalizer
|
5
|
+
class Names < Normalizer
|
6
|
+
@keys = [
|
7
|
+
:author, :editor, :translator, :director, :producer
|
8
|
+
]
|
9
|
+
|
10
|
+
attr_accessor :namae
|
11
|
+
|
12
|
+
# Sets up the name normalizer and its Namae parser.
#
# Options are forwarded unchanged to the parent initializer.
def initialize(**opts)
  super(**opts)

  settings = {
    prefer_comma_as_separator: true,
    # Connectives accepted between two names.
    separator: /\A(and|AND|&|;|und|UND|y|e)\s+/,
    # /\A(?!x)x/ can never match, so these two patterns never fire.
    appellation: /\A(?!x)x/,
    title: /\A(?!x)x/
  }

  @namae = Namae::Parser.new(settings)
end
|
22
|
+
|
23
|
+
# Turns the raw name strings of +item+ into parsed name lists, using the
# +map_values+ helper (defined outside this file).
#
# A value made up only of punctuation/whitespace is treated as a "same as
# above" marker and resolved against the last entry of +prev+. Values that
# cannot be parsed fall back to a single literal name.
#
# @param item [Hash] the reference being normalized
# @param prev [Array<Hash>] previously normalized references
# @return whatever +map_values+ returns
def normalize(item, prev: [], **opts)
  map_values(item) do |key, value|
    # Drop an opening bracket and trailing separators/closing brackets.
    # Note: this edits the string in place.
    value.gsub!(/(^[\(\[]|[,;:\)\]]+$)/, '')

    if repeater?(value) && prev.length > 0
      last = prev[-1]
      # Look up the same role first, then fall back to the author. Guard
      # against a missing key: previously `prev[-1][key][0]` raised
      # NoMethodError on nil before the `||` fallback could run.
      (last[key] || [])[0] || (last[:author] || [])[0]
    else
      begin
        parse(strip(value))
      rescue
        # Unparseable input is kept verbatim as a literal name.
        [{ literal: value }]
      end
    end
  end
end
|
38
|
+
|
39
|
+
# Tells whether +value+ consists solely of punctuation and whitespace
# (e.g. a row of dashes standing in for a repeated name).
#
# @return [Integer, nil] the match offset, or nil when there is no match
def repeater?(value)
  only_marks = /^[\p{P}\s]+$/
  value =~ only_marks
end
|
42
|
+
|
43
|
+
# Removes role markers, bracketed remarks and stray punctuation from a raw
# name string before it is handed to the name parser.
#
# @param value [String] raw name field
# @return [String] a new, cleaned-up string (+value+ is not modified)
def strip(value)
  value
    # Leading "In" / "In:".
    .gsub(/^[Ii]n:?\s+/, '')
    # Editor markers (English / French).
    .gsub(/\b[EÉeé]d(s?\.|itors?\.?|ited|iteurs?|ité)(\s+(by|par)\s+|\b|$)/, '')
    # Editor markers (German).
    .gsub(/\b([Hh](rsg|gg?)\.|Herausgeber)\s+/, '')
    .gsub(/\b[Hh]erausgegeben von\s+/, '')
    # Translator markers (German, English, French).
    .gsub(/\b((d|ein)er )?[Üü]ber(s\.|setzt|setzung|tragen|tragung) v(\.|on)\s+/, '')
    .gsub(/\b[Tt]rans(l?\.|lated|lation)(\s+by\b)?\s*/, '')
    .gsub(/\b[Tt]rad(ucteurs?|(uit|\.)(\s+par\b)?)\s*/, '')
    # Director / producer / performer markers.
    .gsub(/\b([Dd]ir(\.|ected))(\s+by)?\s+/, '')
    .gsub(/\b([Pp]rod(\.|uce[rd]))(\s+by)?\s+/, '')
    .gsub(/\b([Pp]erf(\.|orme[rd]))(\s+by)?\s+/, '')
    .gsub(/\*/, '')
    # Parenthesised remarks, closed or not.
    .gsub(/\([^\)]*\)?/, '')
    # Bracketed remarks, closed or not. The closing delimiter used to be
    # `\)?`, which left the "]" behind.
    .gsub(/\[[^\]]*\]?/, '')
    .gsub(/[;:]/, ',')
    # Non-letters at the start, or whitespace plus non-letters at the end.
    .gsub(/^\p{^L}+|\s+\p{^L}+$/, '')
    .gsub(/[\s,\.]+$/, '')
    .gsub(/,{2,}/, ',')
    .gsub(/\s+\./, '.')
end
|
64
|
+
|
65
|
+
# Parses a cleaned-up name string into a list of name hashes.
#
# The string is edited in place: an "et al." style tail (or an ellipsis) is
# cut off and remembered, and Vancouver-style "Surname INITIALS" pairs get a
# comma inserted so the name parser can split them.
#
# @param value [String] name string; modified in place
# @return [Array<Hash>] one hash per name with nil entries removed, followed
#   by { others: true } when an "and others" marker was found
# @raise [ArgumentError] if +value+ is empty
def parse(value)
  raise ArgumentError if value.empty?

  et_al = /(,\s+)?((\&\s+)?\bet\s+(al|coll)\b|\bu\.\s*a\b|(\band|\&)\s+others).*$/
  others = value.sub!(et_al, '') || value.sub!(/\.\.\.|…/, '')

  # Add surname/initial punctuation separator for Vancouver-style names
  # E.g. Rang HP, Dale MM, Ritter JM, Moore PK
  if value.match(/^(\p{Lu}[^\s,.]+)\s+([\p{Lu}][\p{Lu}\-]{0,3})(,|[.]?$)/)
    value.gsub!(/\b(\p{Lu}[^\s,.]+)\s+([\p{Lu}][\p{Lu}\-]{0,3})(,|[.]?$)/, '\1, \2\3')
  end

  names = namae.parse!(value).map do |name|
    name.normalize_initials
    name.to_h.reject { |_field, content| content.nil? }
  end

  names.push({ others: true }) unless others.nil?
  names
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module AnyStyle
  class Normalizer
    # Normalizes :pages values and unpacks the compact
    # "volume.issue(year):pages" notation.
    class Page < Normalizer
      @keys = [:pages]

      # Rewrites every :pages value through the inherited +map_values+
      # helper; volume, issue and year found in the compact notation are
      # passed to the inherited +append+ helper as Integers.
      #
      # @param item [Hash] the reference being normalized
      # @return whatever +map_values+ returns
      def normalize(item, **opts)
        # "volume.issue(year):pp"
        compact = /(\d+)(?:\.(\d+))?(?:\((\d{4})\))?:(\d.*)/

        map_values(item) do |_, value|
          pages =
            if compact === value
              volume, issue, year, rest = Regexp.last_match.captures
              append(item, :volume, volume.to_i)
              append(item, :issue, issue.to_i) unless issue.nil?
              append(item, :year, year.to_i) unless year.nil?
              rest
            else
              value
            end

          # TODO chap. 5, pp. 195-234.

          # Every run of dashes becomes one en dash; anything that is not a
          # digit, comma or en dash collapses into a single space.
          with_dashes = pages.gsub(/\p{Pd}+/, '–')
          with_dashes.gsub(/[^\d,–]+/, ' ').strip
        end
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module AnyStyle
  class Normalizer
    # Resolves the placeholder publisher "Author" to the item's first author.
    class Publisher < Normalizer
      @keys = [:publisher]

      # @param item [Hash] the reference being normalized
      # @return [Hash] the same +item+
      def normalize(item, **opts)
        replace_author(item) if item.key?(:author)
        item
      end

      # Replaces, in place, every publisher value that is exactly "Author"
      # with the first author value. Values are visited through the
      # inherited +each_value+ helper (defined outside this file).
      #
      # @param item [Hash] must contain :author
      # @return [Hash] +item+ when there is no author value, otherwise
      #   whatever +each_value+ returns
      def replace_author(item)
        author = item[:author][0]
        # Nothing to substitute; previously a nil replacement raised TypeError.
        return item if author.nil?

        each_value(item) do |_, value|
          # Block form inserts the author verbatim. As a replacement
          # *string*, backslash sequences in the name (\', \0, \1 ...) were
          # interpreted by gsub! and mangled e.g. LaTeX-accented names.
          value.gsub!(/^Author$/) { author }
        end
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module AnyStyle
  class Normalizer
    # Picks PubMed identifiers out of :note values.
    class PubMed < Normalizer
      @keys = [:note]

      # Scans every :note value (via the inherited +each_value+ helper) for
      # "PMID" and "PMC" numbers and passes them to the inherited +append+
      # helper as :pmid / :pmcid.
      #
      # @param item [Hash] the reference being normalized
      # @return whatever +each_value+ returns
      def normalize(item, **opts)
        each_value(item) do |_, note|
          append(item, :pmid, Regexp.last_match(1)) if note =~ /PMID:?\s*(\d+)/
          append(item, :pmcid, Regexp.last_match(1)) if note =~ /PMC(\d+)/
        end
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module AnyStyle
  class Normalizer
    # Strips dangling punctuation from the ends of textual fields.
    class Punctuation < Normalizer
      @keys = %i[
        container-title
        collection-title
        date
        edition
        journal
        location
        publisher
        title
      ]

      # Edits each value in place (via the inherited +each_value+ helper):
      # trailing closing brackets, punctuation, dashes, separators and
      # control characters are removed, then one leading "(" or "[".
      #
      # @param item [Hash] the reference being normalized
      # @return whatever +each_value+ returns
      def normalize(item, **opts)
        each_value(item) do |_, text|
          text.gsub!(/[\)\]\.,:;\p{Pd}\p{Z}\p{C}]+$/, '')
          text.gsub!(/^[\(\[]/, '')
        end
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module AnyStyle
  class Normalizer
    # Removes a quotation mark or square bracket at the very start or end of
    # a value.
    class Quotes < Normalizer
      # One opening quote/"[" at the start of a line, or one closing
      # quote/"]" at the end of a line.
      QUOTES = /^[«‹»›„‚“‟‘‛”’"❛❜❟❝❞⹂〝〞〟\[]|[«‹»›„‚“‟‘‛”’"❛❜❟❝❞⹂〝〞〟\]]$/
      @keys = %i[title citation-number medium]

      # Edits every matching value in place via the inherited +each_value+
      # helper.
      #
      # @param item [Hash] the reference being normalized
      # @return whatever +each_value+ returns
      def normalize(item, **opts)
        each_value(item) do |_, text|
          text.gsub!(QUOTES, '')
        end
      end
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module AnyStyle
  class Normalizer
    # Guesses a type for items that do not have one yet, based on which
    # fields are present and on keywords in those fields.
    class Type < Normalizer
      # Sets item[:type] unless the key is already present. The value may be
      # nil when no rule applies.
      #
      # @param item [Hash] the reference being normalized
      # @return [Hash] the same +item+
      def normalize(item, **opts)
        item[:type] = classify item unless item.key?(:type)
        item
      end

      # Derives a type name from the fields of +item+. The checks run in a
      # fixed order: container title, genre, medium, publisher.
      #
      # @param item [Hash]
      # @return [String, nil] the type, or nil when nothing matches
      def classify(item)
        case
        when item.key?(:'container-title')
          container = item[:'container-title'].to_s

          case
          when item.key?(:issue)
            'article-journal'
          when container =~ /proceedings|proc\.|conference|meeting|symposi(on|um)/i
            'paper-conference'
          when container =~ /journal|zeitschrift|quarterly|review|revue/i
            'article-journal'
          else
            'chapter'
          end
        when item.key?(:genre)
          case item[:genre].to_s
          when /ph(\.\s*)?d|diss(\.|ertation)|thesis/i
            'thesis'
          when /rep(\.|ort)/i
            'report'
          when /unpublished|manuscript/i
            'manuscript'
          when /patent/i
            'patent'
          when /personal communication/i
            'personal_communication'
          when /interview/i
            'interview'
          # Case-insensitive like every other genre pattern; without /i
          # "Online" or "Web page" were not recognised.
          when /web|online|en ligne/i
            'webpage'
          end
        when item.key?(:medium)
          case item[:medium].to_s
          when /dvd|video|vhs|motion/i
            'motion_picture'
          when /television/i
            'broadcast'
          end
        when item.key?(:publisher)
          'book'
        end
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module AnyStyle
  class Normalizer
    # Separates issue numbers from :volume values and trims volume labels.
    class Volume < Normalizer
      @keys = [:volume, :pages, :date]

      # Rewrites every :volume value through the inherited +map_values+
      # helper; issue numbers that are found are passed to the inherited
      # +append+ helper.
      #
      # @param item [Hash] the reference being normalized
      # @return whatever +map_values+ returns
      def normalize(item, **opts)
        map_values(item, [:volume]) do |_, volume|
          case volume
          when /(\p{Lu}?\d+)\s?\(([^)]+)\)/
            # "12 (3)": the issue is given in parentheses.
            number, issue = Regexp.last_match.captures
            append(item, :issue, issue)
            number
          when /(?:(\p{Lu}?\d+)[\p{P}\s]+)?(?:nos?|nr|n°|nº|iss?)\.?\s?(.+)$/i
            # "12, no. 3": the volume part is optional, so +number+ may be nil.
            number, issue = Regexp.last_match.captures
            append(item, :issue, issue.sub(/\p{P}$/, ''))
            number
          else
            # Plain volume: drop leading punctuation, everything up to a
            # "vol."/"volume" label, and one trailing punctuation mark.
            trimmed = volume.sub(/^[\p{P}\s]+/, '')
            trimmed = trimmed.sub(/.*vol(ume)?[\p{P}\s]+/i, '')
            trimmed.sub(/\p{P}$/, '')
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,199 @@
|
|
1
|
+
module AnyStyle
|
2
|
+
class ParserCore
|
3
|
+
include StringUtils
|
4
|
+
|
5
|
+
class << self
  # Class-level settings; assigned by subclasses as class instance variables.
  attr_reader :defaults, :formats

  # Creates a parser that uses the model stored at +path+.
  def load(path)
    new(model: path)
  end

  # Returns a default parser instance
  # (one per thread, created on first use and kept in Thread.current).
  def instance
    slot = "anystyle_#{name.downcase}"
    Thread.current[slot] ||= new
  end
end
|
17
|
+
|
18
|
+
attr_reader :model, :options, :features, :normalizers
|
19
|
+
|
20
|
+
# Merges +options+ over the class defaults and loads the model.
#
# @param options [Hash] overrides for the class-level defaults
def initialize(options = {})
  base = self.class.defaults
  @options = base.merge(options)

  load_model
end
|
24
|
+
|
25
|
+
# Loads the model from +file+, or creates a fresh model when +file+ is nil.
#
# @param file [String, nil] model path; defaults to options[:model]
# @return [self]
def load_model(file = options[:model])
  if file.nil?
    # Fresh model: built from every option except :model, which is then
    # assigned as the model's path.
    settings = options.reject { |name, _| name == :model }
    @model = Wapiti::Model.new(settings)
    @model.path = options[:model]
  else
    @model = Wapiti.load(file)
    @model.options.update_attributes(options)
  end

  self
end
|
36
|
+
|
37
|
+
# Prepares +input+ and has the model label it.
#
# @param input anything accepted by #prepare
# @param opts options forwarded to #prepare
# @return the result of model.label
def label(input, **opts)
  dataset = prepare(input, **opts)
  model.label(dataset)
end
|
40
|
+
|
41
|
+
# Prepares +input+ as tagged data and has the model check it.
#
# @param input anything accepted by #prepare
# @return the result of model.check
def check(input)
  tagged = prepare(input, tagged: true)
  model.check(tagged)
end
|
44
|
+
|
45
|
+
# Trains the model on tagged +input+.
#
# @param input training data; defaults to options[:training_data]
# @param truncate [Boolean] when true, start from a fresh model first
# @return the model
def train(input = options[:training_data], truncate: true)
  load_model(nil) if truncate

  # Nothing to train on when the input is nil or empty.
  if !input.nil? && !input.empty?
    model.train(prepare(input, tagged: true))
  end

  model
end
|
52
|
+
|
53
|
+
# Trains on +input+ while keeping the current model (no truncation).
#
# @return the model
def learn(input)
  train(input, truncate: false)
end
|
56
|
+
|
57
|
+
# Runs +hash+ through every normalizer in order, skipping those that report
# skip?. A normalizer that raises is reported with +warn+ and its step is
# ignored; processing continues with the next one.
#
# @param hash [Hash] the item to normalize
# @param opts options forwarded to each normalizer
# @return the result of the last successful normalizer
def normalize(hash, **opts)
  normalizers.reduce(hash) do |current, normalizer|
    begin
      if normalizer.skip?
        current
      else
        normalizer.normalize(current, **opts)
      end
    rescue => error
      warn "Error in #{normalizer.name} normalizer: #{error.message}"
      current
    end
  end
end
|
67
|
+
|
68
|
+
# Hook for subclasses: attaches feature observations to +dataset+.
# The base class has no implementation.
#
# @raise [NotImplementedError] always
def expand(dataset)
  raise(NotImplementedError)
end
|
71
|
+
|
72
|
+
# Converts +input+ into a dataset and expands it with feature observations.
#
# * a Wapiti::Dataset is expanded as is;
# * a Wapiti::Sequence is wrapped in a new dataset;
# * a String naming an existing file (shorter than 1024 characters) is
#   opened as a dataset file, any other String is parsed as data;
# * everything else is handed to Wapiti::Dataset.parse.
#
# @param input [Wapiti::Dataset, Wapiti::Sequence, String, Object]
# @param opts options forwarded to Wapiti::Dataset.open / .parse
# @return the result of #expand
def prepare(input, **opts)
  case input
  when Wapiti::Dataset
    expand input
  when Wapiti::Sequence
    expand Wapiti::Dataset.new([input])
  when String
    # String#tainted? was removed in Ruby 3.2; only consult it where it
    # still exists so older interpreters keep the original check.
    tainted = input.respond_to?(:tainted?) && input.tainted?

    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    if !tainted && input.length < 1024 && File.exist?(input)
      # Options are splatted so they reach keyword parameters on Ruby 3
      # as well (a positional Hash is no longer converted to keywords).
      expand Wapiti::Dataset.open(input, **opts)
    else
      expand Wapiti::Dataset.parse(input, **opts)
    end
  else
    expand Wapiti::Dataset.parse(input, **opts)
  end
end
|
88
|
+
end
|
89
|
+
|
90
|
+
|
91
|
+
class Parser < ParserCore
|
92
|
+
include Format::BibTeX
|
93
|
+
include Format::CSL
|
94
|
+
|
95
|
+
@formats = [:bibtex, :citeproc, :csl, :hash, :wapiti]
|
96
|
+
|
97
|
+
@defaults = {
|
98
|
+
model: File.join(SUPPORT, 'parser.mod'),
|
99
|
+
pattern: File.join(SUPPORT, 'parser.txt'),
|
100
|
+
compact: true,
|
101
|
+
threads: 4,
|
102
|
+
separator: /(?:\r?\n)+/,
|
103
|
+
delimiter: /\s+/,
|
104
|
+
format: :hash,
|
105
|
+
training_data: File.join(RES, 'parser', 'core.xml')
|
106
|
+
}
|
107
|
+
|
108
|
+
# Builds the parser, then sets up its feature extractors and normalizers.
# Both lists are used in the order given here.
#
# @param options [Hash] parser options; :dictionary may supply the
#   dictionary used by the dictionary feature
def initialize(options = {})
  super(options)

  # Token features.
  @features = [
    Feature::Canonical.new,
    Feature::Category.new,
    Feature::Affix.new(size: 2),
    Feature::Affix.new(size: 2, suffix: true),
    Feature::Caps.new,
    Feature::Number.new,
    # Falls back to the shared dictionary when none was passed in.
    Feature::Dictionary.new(
      dictionary: options[:dictionary] || Dictionary.instance
    ),
    Feature::Keyword.new,
    Feature::Position.new,
    Feature::Punctuation.new,
    Feature::Brackets.new,
    Feature::Terminal.new,
    Feature::Locator.new
  ]

  # Normalizers applied to each labelled item.
  @normalizers = [
    Normalizer::Quotes.new,
    Normalizer::Brackets.new,
    Normalizer::Punctuation.new,
    Normalizer::Journal.new,
    Normalizer::Container.new,
    Normalizer::Edition.new,
    Normalizer::Volume.new,
    Normalizer::Page.new,
    Normalizer::Date.new,
    Normalizer::Location.new,
    Normalizer::Locator.new,
    Normalizer::Publisher.new,
    Normalizer::PubMed.new,
    Normalizer::Names.new,
    Normalizer::Locale.new,
    Normalizer::Type.new
  ]
end
|
146
|
+
|
147
|
+
# Assigns feature observations to every token of every sequence.
#
# Each feature observes the token value together with its scrubbed form
# (+alpha+), the token index and the enclosing sequence.
#
# @param dataset an enumerable of sequences exposing #tokens
# @return the result of dataset.each
def expand(dataset)
  dataset.each do |sequence|
    sequence.tokens.each_with_index do |token, position|
      cleaned = scrub(token.value)

      token.observations = features.map do |feature|
        feature.observe(token.value, alpha: cleaned, idx: position, seq: sequence)
      end
    end
  end
end
|
157
|
+
|
158
|
+
# Converts every labelled sequence into a normalized hash.
#
# The items produced so far are passed to #normalize as +prev+, so that a
# normalizer can refer back to earlier entries.
#
# @param dataset an enumerable of sequences responding to #to_h
# @param symbolize_keys [Boolean] forwarded to each sequence's #to_h
# @return [Array] the normalized items, in dataset order
def format_hash(dataset, symbolize_keys: true)
  dataset.each_with_object([]) do |sequence, items|
    raw = sequence.to_h(symbolize_keys: symbolize_keys)
    items << normalize(raw, prev: items)
  end
end
|
163
|
+
|
164
|
+
# Collapses Array values of +hash+ into single values, in place.
#
# An Array with more than one element whose first element responds to
# #join is joined with +spacer+; any other Array is replaced by its first
# element. Non-Array values and keys listed in +skip+ are left alone.
#
# @param hash [Hash] modified in place
# @param skip [Array] keys to leave untouched
# @param spacer [String] separator used when joining
# @return [Hash] the same +hash+
def flatten_values(hash, skip: [], spacer: ' ')
  hash.each_pair do |key, value|
    next unless value.is_a?(Array)
    next if skip.include?(key)

    joinable = value.length > 1 && value[0].respond_to?(:join)
    hash[key] = joinable ? value.join(spacer) : value[0]
  end
end
|
175
|
+
|
176
|
+
# Moves the entry stored under +name+ to +new_name+, if it exists.
#
# @param hash [Hash] modified in place
# @return the moved value, or nil when +name+ is not a key of +hash+
def rename_value(hash, name, new_name)
  return unless hash.key?(name)

  hash[new_name] = hash.delete(name)
end
|
179
|
+
|
180
|
+
# Labels +input+ and returns the result in the requested format.
#
# :wapiti returns the labelled data as is; :hash, :bibtex, :citeproc and
# :csl pass it on to the matching format_* method.
#
# @param input anything accepted by #label
# @param format [Symbol, String] output format; defaults to options[:format]
# @param opts options forwarded to #label and to the formatter
# @raise [ArgumentError] for any other format
def parse(input, format: options[:format], **opts)
  target = format.to_sym
  return label(input, **opts) if target == :wapiti

  unless [:hash, :bibtex, :citeproc, :csl].include?(target)
    raise ArgumentError, "format not supported: #{format}"
  end

  send(:"format_#{format}", label(input, **opts), **opts)
end
|
191
|
+
|
192
|
+
# Fills in the parser's default :separator and :delimiter, joins an Array
# of strings into one newline-separated string, and hands over to the
# parent implementation.
#
# @param input [String, Array<String>, Object]
# @param opts options forwarded to the parent #prepare
# @return the result of the parent #prepare
def prepare(input, **opts)
  opts[:separator] ||= options[:separator]
  opts[:delimiter] ||= options[:delimiter]
  input = input.join("\n") if input.is_a?(Array) && input[0].is_a?(String)

  # The parent signature is (input, **opts). Passing the Hash positionally
  # raises ArgumentError on Ruby 3, where a positional Hash is no longer
  # converted to keywords, so splat it.
  super(input, **opts)
end
|
198
|
+
end
|
199
|
+
end
|