anystyle-parser 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -2
- data/HISTORY.md +6 -0
- data/anystyle-parser.gemspec +1 -1
- data/lib/anystyle/parser/features.rb +208 -208
- data/lib/anystyle/parser/normalizer.rb +359 -359
- data/lib/anystyle/parser/parser.rb +28 -10
- data/lib/anystyle/parser/support/anystyle.mod +32347 -5039
- data/lib/anystyle/parser/version.rb +1 -1
- data/spec/anystyle/parser/features_spec.rb +27 -21
- data/spec/anystyle/parser/normalizer_spec.rb +83 -62
- data/spec/anystyle/parser/parser_spec.rb +49 -6
- data/spec/fixtures/train_dps.txt +12 -0
- data/spec/spec_helper.rb +15 -3
- metadata +7 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 575f1accfff0d04c318a023de9bd4d1f8720ff98
|
4
|
+
data.tar.gz: aa53148f49bb5e26947375016cda44f9c0b97f6d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a962499569b1dfbf853392b447169498f4e7ad537d1ca7ef3b9bd1a0d20c893aa3c77bf5bc1794410cdea50390cce1e3fec3082b9c982c52e482a51d27615f1c
|
7
|
+
data.tar.gz: 92dac81d61b7cc52170f15396785187d0b5d99c1ea7819be1b03318fdd29d80faa669845010ff370fffc17b7ff4c2b5659ba95900b5c4be251cbf4e007e70756
|
data/Gemfile
CHANGED
data/HISTORY.md
CHANGED
data/anystyle-parser.gemspec
CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |s|
|
|
18
18
|
s.required_ruby_version = '>= 1.9.3'
|
19
19
|
|
20
20
|
s.add_runtime_dependency('bibtex-ruby', '~>3.0')
|
21
|
-
s.add_runtime_dependency('wapiti', '~>0.
|
21
|
+
s.add_runtime_dependency('wapiti', '~>0.1')
|
22
22
|
s.add_runtime_dependency('namae', '~>0.8')
|
23
23
|
|
24
24
|
s.files = `git ls-files`.split("\n").reject { |path|
|
@@ -1,211 +1,211 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
|
3
3
|
module Anystyle
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
end
|
4
|
+
module Parser
|
5
|
+
|
6
|
+
class Feature
|
7
|
+
|
8
|
+
@dict = Dictionary.instance
|
9
|
+
@instances = []
|
10
|
+
|
11
|
+
class << self
|
12
|
+
|
13
|
+
attr_reader :dict, :instances
|
14
|
+
|
15
|
+
def define(name, &block)
|
16
|
+
instances << new(name, block)
|
17
|
+
end
|
18
|
+
|
19
|
+
def undefine(name)
|
20
|
+
instances.reject! { |f| f.name == name }
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
24
|
+
|
25
|
+
attr_accessor :name, :matcher
|
26
|
+
|
27
|
+
def initialize(name, matcher)
|
28
|
+
@name, @matcher = name, matcher
|
29
|
+
end
|
30
|
+
|
31
|
+
def match(*arguments)
|
32
|
+
matcher.call(*arguments)
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
|
37
|
+
|
38
|
+
# Is the last character upper-/lowercase, numeric or something else?
|
39
|
+
# Returns :upper, :lower, :numeric or the last character itself.
|
40
|
+
Feature.define :last_character do |token, stripped, sequence, offset|
|
41
|
+
case char = token.split(//)[-1]
|
42
|
+
when /^[[:upper:]]$/
|
43
|
+
:upper
|
44
|
+
when /^[[:lower:]]$/
|
45
|
+
:lower
|
46
|
+
when /^\d$/
|
47
|
+
:numeric
|
48
|
+
else
|
49
|
+
char
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
# Sequences of the first four characters
|
54
|
+
Feature.define :first do |token, stripped, sequence, offset|
|
55
|
+
c = token.split(//)[0,4]
|
56
|
+
(0..3).map { |i| c[0..i].join }
|
57
|
+
end
|
58
|
+
|
59
|
+
# Sequences of the last four characters
|
60
|
+
Feature.define :last do |token, stripped, sequence, offset|
|
61
|
+
c = token.split(//).reverse[0,4]
|
62
|
+
(0..3).map { |i| c[0..i].reverse.join }
|
63
|
+
end
|
64
|
+
|
65
|
+
Feature.define :stripped_lowercase do |token, stripped, sequence, offset|
|
66
|
+
stripped.empty? ? :EMPTY : stripped.downcase
|
67
|
+
end
|
68
|
+
|
69
|
+
Feature.define :capitalization do |token, stripped, sequence, offset|
|
70
|
+
case stripped
|
71
|
+
when /^[[:upper:]]$/
|
72
|
+
:single
|
73
|
+
when /^[[:upper:]][[:lower:]]/
|
74
|
+
:initial
|
75
|
+
when /^[[:upper:]]+$/
|
76
|
+
:all
|
77
|
+
else
|
78
|
+
:other
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
Feature.define :numbers do |token, stripped, sequence, offset|
|
83
|
+
case token
|
84
|
+
when /\d\(\d+([—–-]\d+)?\)/
|
85
|
+
:volume
|
86
|
+
when /^\(\d{4}\)[^[:alnum:]]*$/, /^(1\d{3}|20\d{2})[\.,;:]?$/
|
87
|
+
:year
|
88
|
+
when /\d{4}\s*[—–-]+\s*\d{4}/
|
89
|
+
:'year-range'
|
90
|
+
when /\d+\s*[—–-]+\s*\d+/, /^[^[:alnum:]]*pp?\.\d*[^[:alnum:]]*$/, /^((pp?|s)\.?|pages?)$/i
|
91
|
+
:page
|
92
|
+
when /^\d$/
|
93
|
+
:single
|
94
|
+
when /^\d{2}$/
|
95
|
+
:double
|
96
|
+
when /^\d{3}$/
|
97
|
+
:triple
|
98
|
+
when /^\d+$/
|
99
|
+
:digits
|
100
|
+
when /^\d+[\d-]+$/
|
101
|
+
:serial
|
102
|
+
when /^-\d+$/
|
103
|
+
:negative
|
104
|
+
when /\d+(th|st|nd|rd)[^[:alnum:]]*/i
|
105
|
+
:ordinal
|
106
|
+
when /\d/
|
107
|
+
:numeric
|
108
|
+
else
|
109
|
+
:none
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
Feature.define :dictionary do |token, stripped, sequence, offset|
|
114
|
+
c = Feature.dict[stripped.downcase]
|
115
|
+
f = Dictionary.keys.map do |k|
|
116
|
+
c & Dictionary.code[k] > 0 ? k : ['no',k].join('-').to_sym
|
117
|
+
end
|
118
|
+
f.unshift(c)
|
119
|
+
end
|
120
|
+
|
121
|
+
# TODO sequence features should be called just once per sequence
|
122
|
+
# TODO improve / disambiguate edition
|
123
|
+
Feature.define :editors do |token, stripped, sequence, offest|
|
124
|
+
sequence.any? { |t| t =~ /^(ed|editor|editors|eds|edited|hrsg)$/i } ? :editors : :'no-editors'
|
125
|
+
end
|
126
|
+
|
127
|
+
# TODO Translated
|
128
|
+
|
129
|
+
Feature.define :location do |token, stripped, sequence, offset|
|
130
|
+
((offset.to_f / sequence.length) * 10).round
|
131
|
+
end
|
132
|
+
|
133
|
+
Feature.define :punctuation do |token, stripped, sequence, offset|
|
134
|
+
case token
|
135
|
+
when /^["'”’´‘“`]/
|
136
|
+
:quote
|
137
|
+
when /["'”’´‘“`][!\?\.]$/
|
138
|
+
:'terminal-unquote'
|
139
|
+
when /["'”’´‘“`][,;:-]$/
|
140
|
+
:'internal-unquote'
|
141
|
+
when /["'”’´‘“`]$/
|
142
|
+
:unquote
|
143
|
+
when /^[\[\{].*[\}\]][!\?\.,;:-]?$/
|
144
|
+
:braces
|
145
|
+
when /^<.*>[!\?\.,;:-]?$/
|
146
|
+
:tags
|
147
|
+
when /^[\(].*[\)][!\?\.]$/
|
148
|
+
:'terminal-parens'
|
149
|
+
when /^\(.*\)[,;:-]$/
|
150
|
+
:'internal-parens'
|
151
|
+
when /^\(.*\)$/
|
152
|
+
:parens
|
153
|
+
when /^[\[\{]/
|
154
|
+
:'opening-brace'
|
155
|
+
when /[\}\]][!\?\.,;:-]?$/
|
156
|
+
:'closing-brace'
|
157
|
+
when /^</
|
158
|
+
:'opening-tag'
|
159
|
+
when />[!\?\.,;:-]?$/
|
160
|
+
:'closing-tag'
|
161
|
+
when /^\(/
|
162
|
+
:'opening-parens'
|
163
|
+
when /\)[,;:-]$/
|
164
|
+
:'internal-closing-parens'
|
165
|
+
when /^\)$/
|
166
|
+
:'closing-parens'
|
167
|
+
when /[,;:-]$/
|
168
|
+
:internal
|
169
|
+
when /[!\?\."']$/
|
170
|
+
:terminal
|
171
|
+
when /^\d{2,5}\(\d{2,5}\).?$/
|
172
|
+
:volume
|
173
|
+
when /-+/
|
174
|
+
:hyphen
|
175
|
+
else
|
176
|
+
:others
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
|
181
|
+
Feature.define :type do |token, stripped, sequence, offset|
|
182
|
+
s = sequence.join(' ')
|
183
|
+
case
|
184
|
+
when s =~ /dissertation abstract/i
|
185
|
+
:dissertaion
|
186
|
+
when s =~ /proceeding/i
|
187
|
+
:proceedings
|
188
|
+
when stripped =~ /^in$/i && sequence[offset+1].to_s =~ /^[[:upper:]]/ && sequence[offset-1].to_s =~ /["'”’´‘“`\.;,]$/
|
189
|
+
:collection
|
190
|
+
else
|
191
|
+
:other
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
195
|
+
Feature.define :reference do |token, stripped, sequence, offset|
|
196
|
+
case token
|
197
|
+
when /retrieved/i
|
198
|
+
:retrieved
|
199
|
+
when /isbn/i
|
200
|
+
:isbn
|
201
|
+
when /^doi:/i
|
202
|
+
:doi
|
203
|
+
when /^url|http|www\.[\w\.]+/i
|
204
|
+
:url
|
205
|
+
else
|
206
|
+
:none
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
210
|
+
end
|
211
|
+
end
|