anystyle-parser 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9723d7ea8fd46588c2487a72357c29fc9b1811ce
4
- data.tar.gz: bfe339c9e7ab8883cbef7d8ad4de1a6aba433b53
3
+ metadata.gz: 575f1accfff0d04c318a023de9bd4d1f8720ff98
4
+ data.tar.gz: aa53148f49bb5e26947375016cda44f9c0b97f6d
5
5
  SHA512:
6
- metadata.gz: 18591aa6d5ab49057b57308ff4a38b0e99aa07b45acf244998b47e51653b892ca442a5def2f3ed547f74c4cf338355070749cd31ca44dec08d41d21b2f23912b
7
- data.tar.gz: 2790d3bd5f4fa9a86aae3be03b3dbb9847816911a173186c898def47f28c243f3bf12ccdedb34e914bd5549f3bdaeed24fcc47c014b628f71b96dacc0d39846e
6
+ metadata.gz: a962499569b1dfbf853392b447169498f4e7ad537d1ca7ef3b9bd1a0d20c893aa3c77bf5bc1794410cdea50390cce1e3fec3082b9c982c52e482a51d27615f1c
7
+ data.tar.gz: 92dac81d61b7cc52170f15396785187d0b5d99c1ea7819be1b03318fdd29d80faa669845010ff370fffc17b7ff4c2b5659ba95900b5c4be251cbf4e007e70756
data/Gemfile CHANGED
@@ -17,8 +17,8 @@ group :debug do
17
17
  end
18
18
 
19
19
  group :profile do
20
- gem 'ruby-prof'
21
- gem 'gnuplot'
20
+ gem 'ruby-prof', :require => false, :platform => :mri
21
+ gem 'gnuplot', :require => false, :platform => :mri
22
22
  end
23
23
 
24
24
  group :extra do
data/HISTORY.md CHANGED
@@ -1,3 +1,9 @@
1
+ 0.4.0 / 2014-02-27
2
+ ==================
3
+ * Update wapiti
4
+ * Improve dash patterns
5
+ * Updated default model
6
+
1
7
  0.3.0 / 2014-02-14
2
8
  ==================
3
9
  * Update dependencies
@@ -18,7 +18,7 @@ Gem::Specification.new do |s|
18
18
  s.required_ruby_version = '>= 1.9.3'
19
19
 
20
20
  s.add_runtime_dependency('bibtex-ruby', '~>3.0')
21
- s.add_runtime_dependency('wapiti', '~>0.0')
21
+ s.add_runtime_dependency('wapiti', '~>0.1')
22
22
  s.add_runtime_dependency('namae', '~>0.8')
23
23
 
24
24
  s.files = `git ls-files`.split("\n").reject { |path|
@@ -1,211 +1,211 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module Anystyle
4
- module Parser
5
-
6
- class Feature
7
-
8
- @dict = Dictionary.instance
9
- @instances = []
10
-
11
- class << self
12
-
13
- attr_reader :dict, :instances
14
-
15
- def define(name, &block)
16
- instances << new(name, block)
17
- end
18
-
19
- def undefine(name)
20
- instances.reject! { |f| f.name == name }
21
- end
22
-
23
- end
24
-
25
- attr_accessor :name, :matcher
26
-
27
- def initialize(name, matcher)
28
- @name, @matcher = name, matcher
29
- end
30
-
31
- def match(*arguments)
32
- matcher.call(*arguments)
33
- end
34
-
35
- end
36
-
37
-
38
- # Is the the last character upper-/lowercase, numeric or something else?
39
- # Returns A, a, 0 or the last character itself.
40
- Feature.define :last_character do |token, stripped, sequence, offset|
41
- case char = token.split(//)[-1]
42
- when /^[[:upper:]]$/
43
- :upper
44
- when /^[[:lower:]]$/
45
- :lower
46
- when /^\d$/
47
- :numeric
48
- else
49
- char
50
- end
51
- end
52
-
53
- # Sequences of the first four characters
54
- Feature.define :first do |token, stripped, sequence, offset|
55
- c = token.split(//)[0,4]
56
- (0..3).map { |i| c[0..i].join }
57
- end
58
-
59
- # Sequences of the last four characters
60
- Feature.define :last do |token, stripped, sequence, offset|
61
- c = token.split(//).reverse[0,4]
62
- (0..3).map { |i| c[0..i].reverse.join }
63
- end
64
-
65
- Feature.define :stripped_lowercase do |token, stripped, sequence, offset|
66
- stripped.empty? ? :EMPTY : stripped.downcase
67
- end
68
-
69
- Feature.define :capitalization do |token, stripped, sequence, offset|
70
- case stripped
71
- when /^[[:upper:]]$/
72
- :single
73
- when /^[[:upper:]][[:lower:]]/
74
- :initial
75
- when /^[[:upper:]]+$/
76
- :all
77
- else
78
- :other
79
- end
80
- end
81
-
82
- Feature.define :numbers do |token, stripped, sequence, offset|
83
- case token
84
- when /\d\(\d+(-\d+)?\)/
85
- :volume
86
- when /^\(\d{4}\)[^[:alnum:]]*$/, /^(1\d{3}|20\d{2})[\.,;:]?$/
87
- :year
88
- when /\d{4}\s*--?\s*\d{4}/
89
- :'year-range'
90
- when /\d+\s*--?\s*\d+/, /^[^[:alnum:]]*pp?\.\d*[^[:alnum:]]*$/, /^((pp?|s)\.?|pages)$/i
91
- :page
92
- when /^\d$/
93
- :single
94
- when /^\d{2}$/
95
- :double
96
- when /^\d{3}$/
97
- :triple
98
- when /^\d+$/
99
- :digits
100
- when /^\d+[\d-]+$/
101
- :serial
102
- when /^-\d+$/
103
- :negative
104
- when /\d+(th|st|nd|rd)[^[:alnum:]]*/i
105
- :ordinal
106
- when /\d/
107
- :numeric
108
- else
109
- :none
110
- end
111
- end
112
-
113
- Feature.define :dictionary do |token, stripped, sequence, offset|
114
- c = Feature.dict[stripped.downcase]
115
- f = Dictionary.keys.map do |k|
116
- c & Dictionary.code[k] > 0 ? k : ['no',k].join('-').to_sym
117
- end
118
- f.unshift(c)
119
- end
120
-
121
- # TODO sequence features should be called just once per sequence
122
- # TODO improve / disambiguate edition
123
- Feature.define :editors do |token, stripped, sequence, offest|
124
- sequence.any? { |t| t =~ /^(ed|editor|editors|eds|edited|hrsg)$/i } ? :editors : :'no-editors'
125
- end
126
-
127
- # TODO Translated
128
-
129
- Feature.define :location do |token, stripped, sequence, offset|
130
- ((offset.to_f / sequence.length) * 10).round
131
- end
132
-
133
- Feature.define :punctuation do |token, stripped, sequence, offset|
134
- case token
135
- when /^["'”’´‘“`]/
136
- :quote
137
- when /["'”’´‘“`][!\?\.]$/
138
- :'terminal-unquote'
139
- when /["'”’´‘“`][,;:-]$/
140
- :'internal-unquote'
141
- when /["'”’´‘“`]$/
142
- :unquote
143
- when /^[\[\{].*[\}\]][!\?\.,;:-]?$/
144
- :braces
145
- when /^<.*>[!\?\.,;:-]?$/
146
- :tags
147
- when /^[\(].*[\)][!\?\.]$/
148
- :'terminal-parens'
149
- when /^\(.*\)[,;:-]$/
150
- :'internal-parens'
151
- when /^\(.*\)$/
152
- :parens
153
- when /^[\[\{]/
154
- :'opening-brace'
155
- when /[\}\]][!\?\.,;:-]?$/
156
- :'closing-brace'
157
- when /^</
158
- :'opening-tag'
159
- when />[!\?\.,;:-]?$/
160
- :'closing-tag'
161
- when /^\(/
162
- :'opening-parens'
163
- when /\)[,;:-]$/
164
- :'internal-closing-parens'
165
- when /^\)$/
166
- :'closing-parens'
167
- when /[,;:-]$/
168
- :internal
169
- when /[!\?\."']$/
170
- :terminal
171
- when /^\d{2,5}\(\d{2,5}\).?$/
172
- :volume
173
- when /-+/
174
- :hyphen
175
- else
176
- :others
177
- end
178
- end
179
-
180
-
181
- Feature.define :type do |token, stripped, sequence, offset|
182
- s = sequence.join(' ')
183
- case
184
- when s =~ /dissertation abstract/i
185
- :dissertaion
186
- when s =~ /proceeding/i
187
- :proceedings
188
- when stripped =~ /^in$/i && sequence[offset+1].to_s =~ /^[[:upper:]]/ && sequence[offset-1].to_s =~ /["'”’´‘“`\.;,]$/
189
- :collection
190
- else
191
- :other
192
- end
193
- end
194
-
195
- Feature.define :reference do |token, stripped, sequence, offset|
196
- case token
197
- when /retrieved/i
198
- :retrieved
199
- when /isbn/i
200
- :isbn
201
- when /^doi:/i
202
- :doi
203
- when /^url|http|www\.[\w\.]+/i
204
- :url
205
- else
206
- :none
207
- end
208
- end
209
-
210
- end
211
- end
4
+ module Parser
5
+
6
+ class Feature
7
+
8
+ @dict = Dictionary.instance
9
+ @instances = []
10
+
11
+ class << self
12
+
13
+ attr_reader :dict, :instances
14
+
15
+ def define(name, &block)
16
+ instances << new(name, block)
17
+ end
18
+
19
+ def undefine(name)
20
+ instances.reject! { |f| f.name == name }
21
+ end
22
+
23
+ end
24
+
25
+ attr_accessor :name, :matcher
26
+
27
+ def initialize(name, matcher)
28
+ @name, @matcher = name, matcher
29
+ end
30
+
31
+ def match(*arguments)
32
+ matcher.call(*arguments)
33
+ end
34
+
35
+ end
36
+
37
+
38
+ # Is the last character upper-/lowercase, numeric or something else?
39
+ # Returns :upper, :lower, :numeric or the last character itself.
40
+ Feature.define :last_character do |token, stripped, sequence, offset|
41
+ case char = token.split(//)[-1]
42
+ when /^[[:upper:]]$/
43
+ :upper
44
+ when /^[[:lower:]]$/
45
+ :lower
46
+ when /^\d$/
47
+ :numeric
48
+ else
49
+ char
50
+ end
51
+ end
52
+
53
+ # Sequences of the first four characters
54
+ Feature.define :first do |token, stripped, sequence, offset|
55
+ c = token.split(//)[0,4]
56
+ (0..3).map { |i| c[0..i].join }
57
+ end
58
+
59
+ # Sequences of the last four characters
60
+ Feature.define :last do |token, stripped, sequence, offset|
61
+ c = token.split(//).reverse[0,4]
62
+ (0..3).map { |i| c[0..i].reverse.join }
63
+ end
64
+
65
+ Feature.define :stripped_lowercase do |token, stripped, sequence, offset|
66
+ stripped.empty? ? :EMPTY : stripped.downcase
67
+ end
68
+
69
+ Feature.define :capitalization do |token, stripped, sequence, offset|
70
+ case stripped
71
+ when /^[[:upper:]]$/
72
+ :single
73
+ when /^[[:upper:]][[:lower:]]/
74
+ :initial
75
+ when /^[[:upper:]]+$/
76
+ :all
77
+ else
78
+ :other
79
+ end
80
+ end
81
+
82
+ Feature.define :numbers do |token, stripped, sequence, offset|
83
+ case token
84
+ when /\d\(\d+([—–-]\d+)?\)/
85
+ :volume
86
+ when /^\(\d{4}\)[^[:alnum:]]*$/, /^(1\d{3}|20\d{2})[\.,;:]?$/
87
+ :year
88
+ when /\d{4}\s*[—–-]+\s*\d{4}/
89
+ :'year-range'
90
+ when /\d+\s*[—–-]+\s*\d+/, /^[^[:alnum:]]*pp?\.\d*[^[:alnum:]]*$/, /^((pp?|s)\.?|pages?)$/i
91
+ :page
92
+ when /^\d$/
93
+ :single
94
+ when /^\d{2}$/
95
+ :double
96
+ when /^\d{3}$/
97
+ :triple
98
+ when /^\d+$/
99
+ :digits
100
+ when /^\d+[\d-]+$/
101
+ :serial
102
+ when /^-\d+$/
103
+ :negative
104
+ when /\d+(th|st|nd|rd)[^[:alnum:]]*/i
105
+ :ordinal
106
+ when /\d/
107
+ :numeric
108
+ else
109
+ :none
110
+ end
111
+ end
112
+
113
+ Feature.define :dictionary do |token, stripped, sequence, offset|
114
+ c = Feature.dict[stripped.downcase]
115
+ f = Dictionary.keys.map do |k|
116
+ c & Dictionary.code[k] > 0 ? k : ['no',k].join('-').to_sym
117
+ end
118
+ f.unshift(c)
119
+ end
120
+
121
+ # TODO sequence features should be called just once per sequence
122
+ # TODO improve / disambiguate edition
123
+ Feature.define :editors do |token, stripped, sequence, offest|
124
+ sequence.any? { |t| t =~ /^(ed|editor|editors|eds|edited|hrsg)$/i } ? :editors : :'no-editors'
125
+ end
126
+
127
+ # TODO Translated
128
+
129
+ Feature.define :location do |token, stripped, sequence, offset|
130
+ ((offset.to_f / sequence.length) * 10).round
131
+ end
132
+
133
+ Feature.define :punctuation do |token, stripped, sequence, offset|
134
+ case token
135
+ when /^["'”’´‘“`]/
136
+ :quote
137
+ when /["'”’´‘“`][!\?\.]$/
138
+ :'terminal-unquote'
139
+ when /["'”’´‘“`][,;:-]$/
140
+ :'internal-unquote'
141
+ when /["'”’´‘“`]$/
142
+ :unquote
143
+ when /^[\[\{].*[\}\]][!\?\.,;:-]?$/
144
+ :braces
145
+ when /^<.*>[!\?\.,;:-]?$/
146
+ :tags
147
+ when /^[\(].*[\)][!\?\.]$/
148
+ :'terminal-parens'
149
+ when /^\(.*\)[,;:-]$/
150
+ :'internal-parens'
151
+ when /^\(.*\)$/
152
+ :parens
153
+ when /^[\[\{]/
154
+ :'opening-brace'
155
+ when /[\}\]][!\?\.,;:-]?$/
156
+ :'closing-brace'
157
+ when /^</
158
+ :'opening-tag'
159
+ when />[!\?\.,;:-]?$/
160
+ :'closing-tag'
161
+ when /^\(/
162
+ :'opening-parens'
163
+ when /\)[,;:-]$/
164
+ :'internal-closing-parens'
165
+ when /^\)$/
166
+ :'closing-parens'
167
+ when /[,;:-]$/
168
+ :internal
169
+ when /[!\?\."']$/
170
+ :terminal
171
+ when /^\d{2,5}\(\d{2,5}\).?$/
172
+ :volume
173
+ when /-+/
174
+ :hyphen
175
+ else
176
+ :others
177
+ end
178
+ end
179
+
180
+
181
+ Feature.define :type do |token, stripped, sequence, offset|
182
+ s = sequence.join(' ')
183
+ case
184
+ when s =~ /dissertation abstract/i
185
+ :dissertaion
186
+ when s =~ /proceeding/i
187
+ :proceedings
188
+ when stripped =~ /^in$/i && sequence[offset+1].to_s =~ /^[[:upper:]]/ && sequence[offset-1].to_s =~ /["'”’´‘“`\.;,]$/
189
+ :collection
190
+ else
191
+ :other
192
+ end
193
+ end
194
+
195
+ Feature.define :reference do |token, stripped, sequence, offset|
196
+ case token
197
+ when /retrieved/i
198
+ :retrieved
199
+ when /isbn/i
200
+ :isbn
201
+ when /^doi:/i
202
+ :doi
203
+ when /^url|http|www\.[\w\.]+/i
204
+ :url
205
+ else
206
+ :none
207
+ end
208
+ end
209
+
210
+ end
211
+ end