anystyle-parser 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9723d7ea8fd46588c2487a72357c29fc9b1811ce
4
- data.tar.gz: bfe339c9e7ab8883cbef7d8ad4de1a6aba433b53
3
+ metadata.gz: 575f1accfff0d04c318a023de9bd4d1f8720ff98
4
+ data.tar.gz: aa53148f49bb5e26947375016cda44f9c0b97f6d
5
5
  SHA512:
6
- metadata.gz: 18591aa6d5ab49057b57308ff4a38b0e99aa07b45acf244998b47e51653b892ca442a5def2f3ed547f74c4cf338355070749cd31ca44dec08d41d21b2f23912b
7
- data.tar.gz: 2790d3bd5f4fa9a86aae3be03b3dbb9847816911a173186c898def47f28c243f3bf12ccdedb34e914bd5549f3bdaeed24fcc47c014b628f71b96dacc0d39846e
6
+ metadata.gz: a962499569b1dfbf853392b447169498f4e7ad537d1ca7ef3b9bd1a0d20c893aa3c77bf5bc1794410cdea50390cce1e3fec3082b9c982c52e482a51d27615f1c
7
+ data.tar.gz: 92dac81d61b7cc52170f15396785187d0b5d99c1ea7819be1b03318fdd29d80faa669845010ff370fffc17b7ff4c2b5659ba95900b5c4be251cbf4e007e70756
data/Gemfile CHANGED
@@ -17,8 +17,8 @@ group :debug do
17
17
  end
18
18
 
19
19
  group :profile do
20
- gem 'ruby-prof'
21
- gem 'gnuplot'
20
+ gem 'ruby-prof', :require => false, :platform => :mri
21
+ gem 'gnuplot', :require => false, :platform => :mri
22
22
  end
23
23
 
24
24
  group :extra do
data/HISTORY.md CHANGED
@@ -1,3 +1,9 @@
1
+ 0.4.0 / 2014-02-27
2
+ ==================
3
+ * Update wapiti
4
+ * Improve dash patterns
5
+ * Update default model
6
+
1
7
  0.3.0 / 2014-02-14
2
8
  ==================
3
9
  * Update dependencies
data/anystyle-parser.gemspec CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |s|
18
18
  s.required_ruby_version = '>= 1.9.3'
19
19
 
20
20
  s.add_runtime_dependency('bibtex-ruby', '~>3.0')
21
- s.add_runtime_dependency('wapiti', '~>0.0')
21
+ s.add_runtime_dependency('wapiti', '~>0.1')
22
22
  s.add_runtime_dependency('namae', '~>0.8')
23
23
 
24
24
  s.files = `git ls-files`.split("\n").reject { |path|
@@ -1,211 +1,211 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module Anystyle
4
- module Parser
5
-
6
- class Feature
7
-
8
- @dict = Dictionary.instance
9
- @instances = []
10
-
11
- class << self
12
-
13
- attr_reader :dict, :instances
14
-
15
- def define(name, &block)
16
- instances << new(name, block)
17
- end
18
-
19
- def undefine(name)
20
- instances.reject! { |f| f.name == name }
21
- end
22
-
23
- end
24
-
25
- attr_accessor :name, :matcher
26
-
27
- def initialize(name, matcher)
28
- @name, @matcher = name, matcher
29
- end
30
-
31
- def match(*arguments)
32
- matcher.call(*arguments)
33
- end
34
-
35
- end
36
-
37
-
38
- # Is the the last character upper-/lowercase, numeric or something else?
39
- # Returns A, a, 0 or the last character itself.
40
- Feature.define :last_character do |token, stripped, sequence, offset|
41
- case char = token.split(//)[-1]
42
- when /^[[:upper:]]$/
43
- :upper
44
- when /^[[:lower:]]$/
45
- :lower
46
- when /^\d$/
47
- :numeric
48
- else
49
- char
50
- end
51
- end
52
-
53
- # Sequences of the first four characters
54
- Feature.define :first do |token, stripped, sequence, offset|
55
- c = token.split(//)[0,4]
56
- (0..3).map { |i| c[0..i].join }
57
- end
58
-
59
- # Sequences of the last four characters
60
- Feature.define :last do |token, stripped, sequence, offset|
61
- c = token.split(//).reverse[0,4]
62
- (0..3).map { |i| c[0..i].reverse.join }
63
- end
64
-
65
- Feature.define :stripped_lowercase do |token, stripped, sequence, offset|
66
- stripped.empty? ? :EMPTY : stripped.downcase
67
- end
68
-
69
- Feature.define :capitalization do |token, stripped, sequence, offset|
70
- case stripped
71
- when /^[[:upper:]]$/
72
- :single
73
- when /^[[:upper:]][[:lower:]]/
74
- :initial
75
- when /^[[:upper:]]+$/
76
- :all
77
- else
78
- :other
79
- end
80
- end
81
-
82
- Feature.define :numbers do |token, stripped, sequence, offset|
83
- case token
84
- when /\d\(\d+(-\d+)?\)/
85
- :volume
86
- when /^\(\d{4}\)[^[:alnum:]]*$/, /^(1\d{3}|20\d{2})[\.,;:]?$/
87
- :year
88
- when /\d{4}\s*--?\s*\d{4}/
89
- :'year-range'
90
- when /\d+\s*--?\s*\d+/, /^[^[:alnum:]]*pp?\.\d*[^[:alnum:]]*$/, /^((pp?|s)\.?|pages)$/i
91
- :page
92
- when /^\d$/
93
- :single
94
- when /^\d{2}$/
95
- :double
96
- when /^\d{3}$/
97
- :triple
98
- when /^\d+$/
99
- :digits
100
- when /^\d+[\d-]+$/
101
- :serial
102
- when /^-\d+$/
103
- :negative
104
- when /\d+(th|st|nd|rd)[^[:alnum:]]*/i
105
- :ordinal
106
- when /\d/
107
- :numeric
108
- else
109
- :none
110
- end
111
- end
112
-
113
- Feature.define :dictionary do |token, stripped, sequence, offset|
114
- c = Feature.dict[stripped.downcase]
115
- f = Dictionary.keys.map do |k|
116
- c & Dictionary.code[k] > 0 ? k : ['no',k].join('-').to_sym
117
- end
118
- f.unshift(c)
119
- end
120
-
121
- # TODO sequence features should be called just once per sequence
122
- # TODO improve / disambiguate edition
123
- Feature.define :editors do |token, stripped, sequence, offest|
124
- sequence.any? { |t| t =~ /^(ed|editor|editors|eds|edited|hrsg)$/i } ? :editors : :'no-editors'
125
- end
126
-
127
- # TODO Translated
128
-
129
- Feature.define :location do |token, stripped, sequence, offset|
130
- ((offset.to_f / sequence.length) * 10).round
131
- end
132
-
133
- Feature.define :punctuation do |token, stripped, sequence, offset|
134
- case token
135
- when /^["'”’´‘“`]/
136
- :quote
137
- when /["'”’´‘“`][!\?\.]$/
138
- :'terminal-unquote'
139
- when /["'”’´‘“`][,;:-]$/
140
- :'internal-unquote'
141
- when /["'”’´‘“`]$/
142
- :unquote
143
- when /^[\[\{].*[\}\]][!\?\.,;:-]?$/
144
- :braces
145
- when /^<.*>[!\?\.,;:-]?$/
146
- :tags
147
- when /^[\(].*[\)][!\?\.]$/
148
- :'terminal-parens'
149
- when /^\(.*\)[,;:-]$/
150
- :'internal-parens'
151
- when /^\(.*\)$/
152
- :parens
153
- when /^[\[\{]/
154
- :'opening-brace'
155
- when /[\}\]][!\?\.,;:-]?$/
156
- :'closing-brace'
157
- when /^</
158
- :'opening-tag'
159
- when />[!\?\.,;:-]?$/
160
- :'closing-tag'
161
- when /^\(/
162
- :'opening-parens'
163
- when /\)[,;:-]$/
164
- :'internal-closing-parens'
165
- when /^\)$/
166
- :'closing-parens'
167
- when /[,;:-]$/
168
- :internal
169
- when /[!\?\."']$/
170
- :terminal
171
- when /^\d{2,5}\(\d{2,5}\).?$/
172
- :volume
173
- when /-+/
174
- :hyphen
175
- else
176
- :others
177
- end
178
- end
179
-
180
-
181
- Feature.define :type do |token, stripped, sequence, offset|
182
- s = sequence.join(' ')
183
- case
184
- when s =~ /dissertation abstract/i
185
- :dissertaion
186
- when s =~ /proceeding/i
187
- :proceedings
188
- when stripped =~ /^in$/i && sequence[offset+1].to_s =~ /^[[:upper:]]/ && sequence[offset-1].to_s =~ /["'”’´‘“`\.;,]$/
189
- :collection
190
- else
191
- :other
192
- end
193
- end
194
-
195
- Feature.define :reference do |token, stripped, sequence, offset|
196
- case token
197
- when /retrieved/i
198
- :retrieved
199
- when /isbn/i
200
- :isbn
201
- when /^doi:/i
202
- :doi
203
- when /^url|http|www\.[\w\.]+/i
204
- :url
205
- else
206
- :none
207
- end
208
- end
209
-
210
- end
211
- end
4
+ module Parser
5
+
6
+ class Feature
7
+
8
+ @dict = Dictionary.instance
9
+ @instances = []
10
+
11
+ class << self
12
+
13
+ attr_reader :dict, :instances
14
+
15
+ def define(name, &block)
16
+ instances << new(name, block)
17
+ end
18
+
19
+ def undefine(name)
20
+ instances.reject! { |f| f.name == name }
21
+ end
22
+
23
+ end
24
+
25
+ attr_accessor :name, :matcher
26
+
27
+ def initialize(name, matcher)
28
+ @name, @matcher = name, matcher
29
+ end
30
+
31
+ def match(*arguments)
32
+ matcher.call(*arguments)
33
+ end
34
+
35
+ end
36
+
37
+
38
+ # Is the last character upper-/lowercase, numeric or something else?
39
+ # Returns :upper, :lower, :numeric or the last character itself.
40
+ Feature.define :last_character do |token, stripped, sequence, offset|
41
+ case char = token.split(//)[-1]
42
+ when /^[[:upper:]]$/
43
+ :upper
44
+ when /^[[:lower:]]$/
45
+ :lower
46
+ when /^\d$/
47
+ :numeric
48
+ else
49
+ char
50
+ end
51
+ end
52
+
53
+ # Sequences of the first four characters
54
+ Feature.define :first do |token, stripped, sequence, offset|
55
+ c = token.split(//)[0,4]
56
+ (0..3).map { |i| c[0..i].join }
57
+ end
58
+
59
+ # Sequences of the last four characters
60
+ Feature.define :last do |token, stripped, sequence, offset|
61
+ c = token.split(//).reverse[0,4]
62
+ (0..3).map { |i| c[0..i].reverse.join }
63
+ end
64
+
65
+ Feature.define :stripped_lowercase do |token, stripped, sequence, offset|
66
+ stripped.empty? ? :EMPTY : stripped.downcase
67
+ end
68
+
69
+ Feature.define :capitalization do |token, stripped, sequence, offset|
70
+ case stripped
71
+ when /^[[:upper:]]$/
72
+ :single
73
+ when /^[[:upper:]][[:lower:]]/
74
+ :initial
75
+ when /^[[:upper:]]+$/
76
+ :all
77
+ else
78
+ :other
79
+ end
80
+ end
81
+
82
+ Feature.define :numbers do |token, stripped, sequence, offset|
83
+ case token
84
+ when /\d\(\d+([—–-]\d+)?\)/
85
+ :volume
86
+ when /^\(\d{4}\)[^[:alnum:]]*$/, /^(1\d{3}|20\d{2})[\.,;:]?$/
87
+ :year
88
+ when /\d{4}\s*[—–-]+\s*\d{4}/
89
+ :'year-range'
90
+ when /\d+\s*[—–-]+\s*\d+/, /^[^[:alnum:]]*pp?\.\d*[^[:alnum:]]*$/, /^((pp?|s)\.?|pages?)$/i
91
+ :page
92
+ when /^\d$/
93
+ :single
94
+ when /^\d{2}$/
95
+ :double
96
+ when /^\d{3}$/
97
+ :triple
98
+ when /^\d+$/
99
+ :digits
100
+ when /^\d+[\d-]+$/
101
+ :serial
102
+ when /^-\d+$/
103
+ :negative
104
+ when /\d+(th|st|nd|rd)[^[:alnum:]]*/i
105
+ :ordinal
106
+ when /\d/
107
+ :numeric
108
+ else
109
+ :none
110
+ end
111
+ end
112
+
113
+ Feature.define :dictionary do |token, stripped, sequence, offset|
114
+ c = Feature.dict[stripped.downcase]
115
+ f = Dictionary.keys.map do |k|
116
+ c & Dictionary.code[k] > 0 ? k : ['no',k].join('-').to_sym
117
+ end
118
+ f.unshift(c)
119
+ end
120
+
121
+ # TODO sequence features should be called just once per sequence
122
+ # TODO improve / disambiguate edition
123
+ Feature.define :editors do |token, stripped, sequence, offest|
124
+ sequence.any? { |t| t =~ /^(ed|editor|editors|eds|edited|hrsg)$/i } ? :editors : :'no-editors'
125
+ end
126
+
127
+ # TODO Translated
128
+
129
+ Feature.define :location do |token, stripped, sequence, offset|
130
+ ((offset.to_f / sequence.length) * 10).round
131
+ end
132
+
133
+ Feature.define :punctuation do |token, stripped, sequence, offset|
134
+ case token
135
+ when /^["'”’´‘“`]/
136
+ :quote
137
+ when /["'”’´‘“`][!\?\.]$/
138
+ :'terminal-unquote'
139
+ when /["'”’´‘“`][,;:-]$/
140
+ :'internal-unquote'
141
+ when /["'”’´‘“`]$/
142
+ :unquote
143
+ when /^[\[\{].*[\}\]][!\?\.,;:-]?$/
144
+ :braces
145
+ when /^<.*>[!\?\.,;:-]?$/
146
+ :tags
147
+ when /^[\(].*[\)][!\?\.]$/
148
+ :'terminal-parens'
149
+ when /^\(.*\)[,;:-]$/
150
+ :'internal-parens'
151
+ when /^\(.*\)$/
152
+ :parens
153
+ when /^[\[\{]/
154
+ :'opening-brace'
155
+ when /[\}\]][!\?\.,;:-]?$/
156
+ :'closing-brace'
157
+ when /^</
158
+ :'opening-tag'
159
+ when />[!\?\.,;:-]?$/
160
+ :'closing-tag'
161
+ when /^\(/
162
+ :'opening-parens'
163
+ when /\)[,;:-]$/
164
+ :'internal-closing-parens'
165
+ when /^\)$/
166
+ :'closing-parens'
167
+ when /[,;:-]$/
168
+ :internal
169
+ when /[!\?\."']$/
170
+ :terminal
171
+ when /^\d{2,5}\(\d{2,5}\).?$/
172
+ :volume
173
+ when /-+/
174
+ :hyphen
175
+ else
176
+ :others
177
+ end
178
+ end
179
+
180
+
181
+ Feature.define :type do |token, stripped, sequence, offset|
182
+ s = sequence.join(' ')
183
+ case
184
+ when s =~ /dissertation abstract/i
185
+ :dissertaion
186
+ when s =~ /proceeding/i
187
+ :proceedings
188
+ when stripped =~ /^in$/i && sequence[offset+1].to_s =~ /^[[:upper:]]/ && sequence[offset-1].to_s =~ /["'”’´‘“`\.;,]$/
189
+ :collection
190
+ else
191
+ :other
192
+ end
193
+ end
194
+
195
+ Feature.define :reference do |token, stripped, sequence, offset|
196
+ case token
197
+ when /retrieved/i
198
+ :retrieved
199
+ when /isbn/i
200
+ :isbn
201
+ when /^doi:/i
202
+ :doi
203
+ when /^url|http|www\.[\w\.]+/i
204
+ :url
205
+ else
206
+ :none
207
+ end
208
+ end
209
+
210
+ end
211
+ end