anystyle-parser 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -2
- data/HISTORY.md +6 -0
- data/anystyle-parser.gemspec +1 -1
- data/lib/anystyle/parser/features.rb +208 -208
- data/lib/anystyle/parser/normalizer.rb +359 -359
- data/lib/anystyle/parser/parser.rb +28 -10
- data/lib/anystyle/parser/support/anystyle.mod +32347 -5039
- data/lib/anystyle/parser/version.rb +1 -1
- data/spec/anystyle/parser/features_spec.rb +27 -21
- data/spec/anystyle/parser/normalizer_spec.rb +83 -62
- data/spec/anystyle/parser/parser_spec.rb +49 -6
- data/spec/fixtures/train_dps.txt +12 -0
- data/spec/spec_helper.rb +15 -3
- metadata +7 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 575f1accfff0d04c318a023de9bd4d1f8720ff98
|
4
|
+
data.tar.gz: aa53148f49bb5e26947375016cda44f9c0b97f6d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a962499569b1dfbf853392b447169498f4e7ad537d1ca7ef3b9bd1a0d20c893aa3c77bf5bc1794410cdea50390cce1e3fec3082b9c982c52e482a51d27615f1c
|
7
|
+
data.tar.gz: 92dac81d61b7cc52170f15396785187d0b5d99c1ea7819be1b03318fdd29d80faa669845010ff370fffc17b7ff4c2b5659ba95900b5c4be251cbf4e007e70756
|
data/Gemfile
CHANGED
data/HISTORY.md
CHANGED
data/anystyle-parser.gemspec
CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |s|
|
|
18
18
|
s.required_ruby_version = '>= 1.9.3'
|
19
19
|
|
20
20
|
s.add_runtime_dependency('bibtex-ruby', '~>3.0')
|
21
|
-
s.add_runtime_dependency('wapiti', '~>0.
|
21
|
+
s.add_runtime_dependency('wapiti', '~>0.1')
|
22
22
|
s.add_runtime_dependency('namae', '~>0.8')
|
23
23
|
|
24
24
|
s.files = `git ls-files`.split("\n").reject { |path|
|
@@ -1,211 +1,211 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
|
3
3
|
module Anystyle
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
end
|
4
|
+
module Parser
|
5
|
+
|
6
|
+
class Feature
|
7
|
+
|
8
|
+
@dict = Dictionary.instance
|
9
|
+
@instances = []
|
10
|
+
|
11
|
+
class << self
|
12
|
+
|
13
|
+
attr_reader :dict, :instances
|
14
|
+
|
15
|
+
def define(name, &block)
|
16
|
+
instances << new(name, block)
|
17
|
+
end
|
18
|
+
|
19
|
+
def undefine(name)
|
20
|
+
instances.reject! { |f| f.name == name }
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
24
|
+
|
25
|
+
attr_accessor :name, :matcher
|
26
|
+
|
27
|
+
def initialize(name, matcher)
|
28
|
+
@name, @matcher = name, matcher
|
29
|
+
end
|
30
|
+
|
31
|
+
def match(*arguments)
|
32
|
+
matcher.call(*arguments)
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
|
37
|
+
|
38
|
+
# Is the last character upper-/lowercase, numeric or something else?
|
39
|
+
# Returns :upper, :lower, :numeric or the last character itself.
|
40
|
+
Feature.define :last_character do |token, stripped, sequence, offset|
|
41
|
+
case char = token.split(//)[-1]
|
42
|
+
when /^[[:upper:]]$/
|
43
|
+
:upper
|
44
|
+
when /^[[:lower:]]$/
|
45
|
+
:lower
|
46
|
+
when /^\d$/
|
47
|
+
:numeric
|
48
|
+
else
|
49
|
+
char
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
# Sequences of the first four characters
|
54
|
+
Feature.define :first do |token, stripped, sequence, offset|
|
55
|
+
c = token.split(//)[0,4]
|
56
|
+
(0..3).map { |i| c[0..i].join }
|
57
|
+
end
|
58
|
+
|
59
|
+
# Sequences of the last four characters
|
60
|
+
Feature.define :last do |token, stripped, sequence, offset|
|
61
|
+
c = token.split(//).reverse[0,4]
|
62
|
+
(0..3).map { |i| c[0..i].reverse.join }
|
63
|
+
end
|
64
|
+
|
65
|
+
Feature.define :stripped_lowercase do |token, stripped, sequence, offset|
|
66
|
+
stripped.empty? ? :EMPTY : stripped.downcase
|
67
|
+
end
|
68
|
+
|
69
|
+
Feature.define :capitalization do |token, stripped, sequence, offset|
|
70
|
+
case stripped
|
71
|
+
when /^[[:upper:]]$/
|
72
|
+
:single
|
73
|
+
when /^[[:upper:]][[:lower:]]/
|
74
|
+
:initial
|
75
|
+
when /^[[:upper:]]+$/
|
76
|
+
:all
|
77
|
+
else
|
78
|
+
:other
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
Feature.define :numbers do |token, stripped, sequence, offset|
|
83
|
+
case token
|
84
|
+
when /\d\(\d+([—–-]\d+)?\)/
|
85
|
+
:volume
|
86
|
+
when /^\(\d{4}\)[^[:alnum:]]*$/, /^(1\d{3}|20\d{2})[\.,;:]?$/
|
87
|
+
:year
|
88
|
+
when /\d{4}\s*[—–-]+\s*\d{4}/
|
89
|
+
:'year-range'
|
90
|
+
when /\d+\s*[—–-]+\s*\d+/, /^[^[:alnum:]]*pp?\.\d*[^[:alnum:]]*$/, /^((pp?|s)\.?|pages?)$/i
|
91
|
+
:page
|
92
|
+
when /^\d$/
|
93
|
+
:single
|
94
|
+
when /^\d{2}$/
|
95
|
+
:double
|
96
|
+
when /^\d{3}$/
|
97
|
+
:triple
|
98
|
+
when /^\d+$/
|
99
|
+
:digits
|
100
|
+
when /^\d+[\d-]+$/
|
101
|
+
:serial
|
102
|
+
when /^-\d+$/
|
103
|
+
:negative
|
104
|
+
when /\d+(th|st|nd|rd)[^[:alnum:]]*/i
|
105
|
+
:ordinal
|
106
|
+
when /\d/
|
107
|
+
:numeric
|
108
|
+
else
|
109
|
+
:none
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
Feature.define :dictionary do |token, stripped, sequence, offset|
|
114
|
+
c = Feature.dict[stripped.downcase]
|
115
|
+
f = Dictionary.keys.map do |k|
|
116
|
+
c & Dictionary.code[k] > 0 ? k : ['no',k].join('-').to_sym
|
117
|
+
end
|
118
|
+
f.unshift(c)
|
119
|
+
end
|
120
|
+
|
121
|
+
# TODO sequence features should be called just once per sequence
|
122
|
+
# TODO improve / disambiguate edition
|
123
|
+
Feature.define :editors do |token, stripped, sequence, offest|
|
124
|
+
sequence.any? { |t| t =~ /^(ed|editor|editors|eds|edited|hrsg)$/i } ? :editors : :'no-editors'
|
125
|
+
end
|
126
|
+
|
127
|
+
# TODO Translated
|
128
|
+
|
129
|
+
Feature.define :location do |token, stripped, sequence, offset|
|
130
|
+
((offset.to_f / sequence.length) * 10).round
|
131
|
+
end
|
132
|
+
|
133
|
+
Feature.define :punctuation do |token, stripped, sequence, offset|
|
134
|
+
case token
|
135
|
+
when /^["'”’´‘“`]/
|
136
|
+
:quote
|
137
|
+
when /["'”’´‘“`][!\?\.]$/
|
138
|
+
:'terminal-unquote'
|
139
|
+
when /["'”’´‘“`][,;:-]$/
|
140
|
+
:'internal-unquote'
|
141
|
+
when /["'”’´‘“`]$/
|
142
|
+
:unquote
|
143
|
+
when /^[\[\{].*[\}\]][!\?\.,;:-]?$/
|
144
|
+
:braces
|
145
|
+
when /^<.*>[!\?\.,;:-]?$/
|
146
|
+
:tags
|
147
|
+
when /^[\(].*[\)][!\?\.]$/
|
148
|
+
:'terminal-parens'
|
149
|
+
when /^\(.*\)[,;:-]$/
|
150
|
+
:'internal-parens'
|
151
|
+
when /^\(.*\)$/
|
152
|
+
:parens
|
153
|
+
when /^[\[\{]/
|
154
|
+
:'opening-brace'
|
155
|
+
when /[\}\]][!\?\.,;:-]?$/
|
156
|
+
:'closing-brace'
|
157
|
+
when /^</
|
158
|
+
:'opening-tag'
|
159
|
+
when />[!\?\.,;:-]?$/
|
160
|
+
:'closing-tag'
|
161
|
+
when /^\(/
|
162
|
+
:'opening-parens'
|
163
|
+
when /\)[,;:-]$/
|
164
|
+
:'internal-closing-parens'
|
165
|
+
when /^\)$/
|
166
|
+
:'closing-parens'
|
167
|
+
when /[,;:-]$/
|
168
|
+
:internal
|
169
|
+
when /[!\?\."']$/
|
170
|
+
:terminal
|
171
|
+
when /^\d{2,5}\(\d{2,5}\).?$/
|
172
|
+
:volume
|
173
|
+
when /-+/
|
174
|
+
:hyphen
|
175
|
+
else
|
176
|
+
:others
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
|
181
|
+
Feature.define :type do |token, stripped, sequence, offset|
|
182
|
+
s = sequence.join(' ')
|
183
|
+
case
|
184
|
+
when s =~ /dissertation abstract/i
|
185
|
+
:dissertaion
|
186
|
+
when s =~ /proceeding/i
|
187
|
+
:proceedings
|
188
|
+
when stripped =~ /^in$/i && sequence[offset+1].to_s =~ /^[[:upper:]]/ && sequence[offset-1].to_s =~ /["'”’´‘“`\.;,]$/
|
189
|
+
:collection
|
190
|
+
else
|
191
|
+
:other
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
195
|
+
Feature.define :reference do |token, stripped, sequence, offset|
|
196
|
+
case token
|
197
|
+
when /retrieved/i
|
198
|
+
:retrieved
|
199
|
+
when /isbn/i
|
200
|
+
:isbn
|
201
|
+
when /^doi:/i
|
202
|
+
:doi
|
203
|
+
when /^url|http|www\.[\w\.]+/i
|
204
|
+
:url
|
205
|
+
else
|
206
|
+
:none
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
210
|
+
end
|
211
|
+
end
|