excite 2.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. data/.gitignore +11 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +8 -0
  4. data/Gemfile.lock +69 -0
  5. data/LICENSE +22 -0
  6. data/README.md +46 -0
  7. data/Rakefile +24 -0
  8. data/config/citation_cleanup_rules.yml +68 -0
  9. data/config/parscit_features.yml +55 -0
  10. data/excite.gemspec +30 -0
  11. data/lib/excite/array_helpers.rb +27 -0
  12. data/lib/excite/citation.rb +48 -0
  13. data/lib/excite/crfparser.rb +322 -0
  14. data/lib/excite/postprocessor.rb +252 -0
  15. data/lib/excite/preprocessor.rb +107 -0
  16. data/lib/excite/resources/dicts/female-names +4954 -0
  17. data/lib/excite/resources/dicts/first-names +27926 -0
  18. data/lib/excite/resources/dicts/male-names +3901 -0
  19. data/lib/excite/resources/dicts/months +24 -0
  20. data/lib/excite/resources/dicts/places +43109 -0
  21. data/lib/excite/resources/dicts/publishers +654 -0
  22. data/lib/excite/resources/dicts/surnames +146259 -0
  23. data/lib/excite/resources/html.template +84 -0
  24. data/lib/excite/resources/html_model +0 -0
  25. data/lib/excite/resources/model +0 -0
  26. data/lib/excite/resources/parsCit.template +76 -0
  27. data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
  28. data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
  29. data/lib/excite/resources/trainingdata/verify.rb +97 -0
  30. data/lib/excite/token_features.rb +313 -0
  31. data/lib/excite/version.rb +7 -0
  32. data/lib/excite.rb +13 -0
  33. data/model/test/analysis.csv +54 -0
  34. data/model/test/array_helpers.rb +30 -0
  35. data/model/test/html-analysis.csv +60 -0
  36. data/model/test/html-output.txt +19893 -0
  37. data/model/test/model_test.rb +306 -0
  38. data/model/test/output.txt +16742 -0
  39. data/spec/excite/citation_spec.rb +128 -0
  40. data/spec/excite/crfparser_spec.rb +118 -0
  41. data/spec/excite/postprocessor_spec.rb +68 -0
  42. data/spec/excite/token_features_spec.rb +641 -0
  43. data/spec/spec_helper.rb +4 -0
  44. metadata +222 -0
@@ -0,0 +1,641 @@
1
+ # encoding: UTF-8
2
+
3
+ module Excite
4
+
5
+ describe TokenFeatures do
6
+
7
# Shared fixture: a parser instance, one sample citation, and the tokens the
# parser produces for that citation once surrounding whitespace is removed.
before do
  @crfparser = CRFParser.new
  @ref = " W. H. Enright. Improving the efficiency of matrix operations in the numerical solution of stiff ordinary differential equations. ACM Trans. Math. Softw., 4(2), 127-136, June 1978. "
  trimmed_reference = @ref.strip
  @tokens = @crfparser.prepare_token_data(trimmed_reference)
end
12
+
13
# Runs the matching private `tok_test_<feature>` checker for every feature the
# parser reports, at every token position of the sample citation.
it 'features' do
  @crfparser.token_features.each do |feature|
    checker = "tok_test_#{feature}"
    @tokens.each_with_index do |_token, position|
      send(checker, feature, @tokens, position)
    end
  end
end
20
+
21
# Table-driven check of `last_char`: each row is [raw token, expected value].
it 'last_char' do
  cases = [
    ['woefij', 'a'],
    ['weofiw234809*&^*oeA', 'A'],
    ['D', 'A'],
    ['Da', 'a'],
    ['1t', 'a'],
    ['t', 'a'],
    ['*', '*'],
    ['!@#$%^&*(', '('],
    ['t1', 0],
    ['1', 0]
  ]

  cases.each do |raw, expected|
    tokens = [Token.new(raw)]
    assert_equal(expected, @crfparser.last_char(tokens, 0))
  end
end
37
+
38
# Table-driven check of `first_1_char`: each row is [raw token, expected value].
it 'first_1_char' do
  cases = [
    ['woefij', 'w'],
    ['weofiw234809*&^*oeA', 'w'],
    ['D', 'D'],
    ['Da', 'D'],
    ['1t', '1'],
    ['t', 't'],
    ['*', '*'],
    ['!@#$%^&*(', '!'],
    ['t1', 't'],
    ['1', '1']
  ]

  cases.each do |raw, expected|
    tokens = [Token.new(raw)]
    assert_equal(expected, @crfparser.first_1_char(tokens, 0))
  end
end
54
+
55
# Table-driven check of `first_2_chars`: each row is [raw token, expected value].
it 'first_2_chars' do
  cases = [
    ['woefij', 'wo'],
    ['weofiw234809*&^*oeA', 'we'],
    ['D', 'D'],
    ['Da', 'Da'],
    ['1t', '1t'],
    ['t', 't'],
    ['*', '*'],
    ['!@#$%^&*(', '!@'],
    ['t1', 't1'],
    ['1', '1']
  ]

  cases.each do |raw, expected|
    tokens = [Token.new(raw)]
    assert_equal(expected, @crfparser.first_2_chars(tokens, 0))
  end
end
71
+
72
# Table-driven check of `first_3_chars`: each row is [raw token, expected value].
it 'first_3_chars' do
  cases = [
    ['woefij', 'woe'],
    ['weofiw234809*&^*oeA', 'weo'],
    ['D', 'D'],
    ['Da', 'Da'],
    ['1t', '1t'],
    ['t', 't'],
    ['*', '*'],
    ['!@#$%^&*(', '!@#'],
    ['t1', 't1'],
    ['1', '1']
  ]

  cases.each do |raw, expected|
    tokens = [Token.new(raw)]
    assert_equal(expected, @crfparser.first_3_chars(tokens, 0))
  end
end
88
+
89
# Table-driven check of `first_4_chars`: each row is [raw token, expected value].
it 'first_4_chars' do
  cases = [
    ['woefij', 'woef'],
    ['weofiw234809*&^*oeA', 'weof'],
    ['D', 'D'],
    ['Da', 'Da'],
    ['Dax0', 'Dax0'],
    ['1t', '1t'],
    ['t', 't'],
    ['*', '*'],
    ['!@#$%^&*(', '!@#$'],
    ['t1', 't1'],
    ['1', '1']
  ]

  cases.each do |raw, expected|
    tokens = [Token.new(raw)]
    assert_equal(expected, @crfparser.first_4_chars(tokens, 0))
  end
end
106
+
107
# Table-driven check of `first_5_chars`: each row is [raw token, expected value].
it 'first_5_chars' do
  cases = [
    ['woefij', 'woefi'],
    ['weofiw234809*&^*oeA', 'weofi'],
    ['D', 'D'],
    ['DadaX', 'DadaX'],
    ['Da', 'Da'],
    ['Dax0', 'Dax0'],
    ['1t', '1t'],
    ['t', 't'],
    ['*', '*'],
    ['!@#$%^&*(', '!@#$%'],
    ['t1', 't1'],
    ['1', '1']
  ]

  cases.each do |raw, expected|
    tokens = [Token.new(raw)]
    assert_equal(expected, @crfparser.first_5_chars(tokens, 0))
  end
end
125
+
126
# Table-driven check of `last_1_char`: each row is [raw token, expected value].
it 'last_1_char' do
  cases = [
    ['woefij', 'j'],
    ['weofiw234809*&^*oeA', 'A'],
    ['D', 'D'],
    ['DadaX', 'X'],
    ['Da', 'a'],
    ['Dax0', '0'],
    ['1t', 't'],
    ['t', 't'],
    ['*', '*'],
    ['!@#$%^&*(', '('],
    ['t1', '1'],
    ['1', '1']
  ]

  cases.each do |raw, expected|
    tokens = [Token.new(raw)]
    assert_equal(expected, @crfparser.last_1_char(tokens, 0))
  end
end
144
+
145
# Table-driven check of `last_2_chars`: each row is [raw token, expected value].
it 'last_2_chars' do
  cases = [
    ['woefij', 'ij'],
    ['weofiw234809*&^*oeA', 'eA'],
    ['D', 'D'],
    ['DadaX', 'aX'],
    ['Da', 'Da'],
    ['Dax0', 'x0'],
    ['1t', '1t'],
    ['t', 't'],
    ['*', '*'],
    ['!@#$%^&*(', '*('],
    ['t1', 't1'],
    ['1', '1']
  ]

  cases.each do |raw, expected|
    tokens = [Token.new(raw)]
    assert_equal(expected, @crfparser.last_2_chars(tokens, 0))
  end
end
163
+
164
# Table-driven check of `last_3_chars`: each row is [raw token, expected value].
it 'last_3_chars' do
  cases = [
    ['woefij', 'fij'],
    ['weofiw234809*&^*oeA', 'oeA'],
    ['D', 'D'],
    ['DadaX', 'daX'],
    ['Da', 'Da'],
    ['Dax0', 'ax0'],
    ['1t', '1t'],
    ['t', 't'],
    ['*', '*'],
    ['!@#$%^&*(', '&*('],
    ['t1', 't1'],
    ['1', '1']
  ]

  cases.each do |raw, expected|
    tokens = [Token.new(raw)]
    assert_equal(expected, @crfparser.last_3_chars(tokens, 0))
  end
end
182
+
183
# Table-driven check of `last_4_chars`: each row is [raw token, expected value].
it 'last_4_chars' do
  cases = [
    ['woefij', 'efij'],
    ['weofiw234809*&^*oeA', '*oeA'],
    ['D', 'D'],
    ['DadaX', 'adaX'],
    ['Da', 'Da'],
    ['Dax0', 'Dax0'],
    ['1t', '1t'],
    ['t', 't'],
    ['*', '*'],
    ['!@#$%^&*(', '^&*('],
    ['t1', 't1'],
    ['1', '1']
  ]

  cases.each do |raw, expected|
    tokens = [Token.new(raw)]
    assert_equal(expected, @crfparser.last_4_chars(tokens, 0))
  end
end
201
+
202
# Table-driven check of `capitalization`: each row is [raw token, expected label].
it 'capitalization' do
  cases = [
    ["W", 'singleCap'],
    ["Enright", 'InitCap'],
    ["IMPROVING", 'AllCap'],
    ["ThE234", 'InitCap'],
    ["efficiency", 'others'],
    ["1978", 'others'],
    [",", 'others']
  ]

  cases.each do |raw, expected|
    tokens = [Token.new(raw)]
    assert_equal(expected, @crfparser.capitalization(tokens, 0))
  end
end
215
+
216
# Table-driven check of `numbers`: each row is [raw token, expected label].
it 'numbers' do
  cases = [
    ['12-34', 'possiblePage'],
    ['19-99', 'possiblePage'],
    ['19(99):', 'year'],
    ['19(99)', 'year'],
    ['(8999)', '4+dig'],
    ['(1999)', 'year'],
    ['(2999)23094', '4+dig'],
    ['wer(299923094', 'hasDig'],
    ['2304$%^&89ddd=)', 'hasDig'],
    ['2304$%^&89=)', '4+dig'],
    ['3$%^&', '1dig'],
    ['3st', 'ordinal'],
    ['3rd', 'ordinal'],
    ['989u83rd', 'hasDig'],
    ['.2.5', '2dig'],
    ['1.2.5', '3dig'],
    ['(1999a)', 'year'],
    ['a1a', 'hasDig'],
    ['awef20.09woeifj', 'hasDig'],
    ['awef2009woeifj', 'year']
  ]

  cases.each do |raw, expected|
    tokens = [Token.new(raw)]
    assert_equal(expected, @crfparser.numbers(tokens, 0))
  end
end
243
+
244
# Exercises `possible_editor` with editor markers, then with unrelated words.
it 'possible_editor' do
  editor_words = %w(ed editor editors eds edited)

  # Each marker alone is reported as a possible editor (parser cleared first).
  editor_words.each do |word|
    @crfparser.clear
    assert_equal("possibleEditors", @crfparser.possible_editor([Token.new(word)], 0))
  end

  # Without a `clear` in between, a later call on an unrelated token list is
  # still expected to answer 'possibleEditors'.
  editor_tokens = editor_words.map { |word| Token.new(word) }
  @crfparser.possible_editor(editor_tokens, 0)
  @crfparser.possible_editor([Token.new("foo")], 0).should == 'possibleEditors'

  # After clearing, a list with no editor marker yields 'noEditors'.
  @crfparser.clear
  plain_tokens = %w(foo bar 123SFOIEJ EDITUR).map { |word| Token.new(word) }
  @crfparser.possible_editor(plain_tokens, 0).should == 'noEditors'
end
258
+
259
# Checks `possible_chapter` against citations expected to be flagged as book
# chapters and against citations expected not to be.
it 'possible_chapter' do
  chapter_citations = [
    'Flajolet, P., Gourdon, X., and Panario, D. Random polynomials and polynomial factorization. In Automata, Languages, and Programming (1996), F. Meyer auf der Heide and B. Monien, Eds., vol. 1099, of Lecture Notes in Computer Science, Springer-Verlag, pp. 232-243. Proceedings of the 23rd ICALP Conference, Paderborn, July 1996.',
    'JA Anderson (1977). Neural models with cognitive implications. In: D. LaBerge and S.J. Samuels (Eds .), Basic Processes in Reading: Perception and Comprehension . Hillsdale, New Jersey: Erlbaum Associates.',
    'Morgan, J.R. and Yarmush M.L. Gene Therapy in Tissue Engineering. In "Frontiers in Tissue Engineering", Patrick, Jr., CW, Mikos, AG, McIntire, LV, (eds.) Pergamon; Elsevier Science Publishers, Amsterdam, The Netherlands 1998; Chapter II.15 278- 310.',
    'Morgan, J.R. "Genetic Engineering of Skin Substitutes" In, Bioengineering of Skin Substitutes, International Business Communications, Inc., Southborough, MA 1998; Chapter 1.4., 61-73',
    'Sheridan, R.L., Morgan, J.R. and Mohammed, R. Biomaterials in Burn and Wound Dressings. In “Polymeric Biomaterials, Second Edition, Revised and Expanded”, Dumitriu (Ed) Marcel Dekker Inc. New York, 2001: Chapter 17; 451-458.'
  ]

  other_citations = [
    'Morse, D. H. 2006. Predator upon a flower. In final revision, scheduled for publication in Fall 2006 or Winter 2007. Harvard University Press. (ca. 400 pp.).',
    'Goldstein J, Perello M, and Nillni EA . 2005. PreproThyrotropin-Releasing Hormone178-199 Affects Tyrosine Hydroxylase Biosynthesis in Hypothalamic Neurons: a Possible Role for Pituitary Prolactin Regulation . In press Journal of Molecular Neuroscience 2006.',
    'Mulcahy LR, and Nillni EA. 2007 . Invited Review. The role of prohormone processing in the physiology of energy balance. Frontiers in Bioscience.'
  ]

  # Chapter citations are checked first, then the others.
  expectations = [
    ['possibleChapter', chapter_citations],
    ['noChapter', other_citations]
  ]

  expectations.each do |label, citations|
    citations.each do |citation|
      tokens = @crfparser.prepare_token_data(citation)
      @crfparser.possible_chapter(tokens).should == label
    end
  end
end
280
+
281
# `is_in` should flag only the token at index 2 of 'end. In "Title'.
it 'in book' do
  book_tokens = @crfparser.prepare_token_data('end. In "Title')

  book_tokens.length.should == 5
  book_tokens.length.times do |position|
    expected = position == 2 ? 'inBook' : 'notInBook'
    @crfparser.is_in(book_tokens, position).should == expected
  end

  # Three plain tokens contain nothing that marks a book.
  plain_tokens = ['a', 'b', 'c'].map { |raw| Token.new(raw) }
  @crfparser.is_in(plain_tokens, 1).should == 'notInBook'
end
292
+
293
# Checks `possible_volume` over several volume/issue notations and over
# look-alike strings that must not be labelled as either.
it 'possible volume' do
  # Spelled-out marker and number as separate tokens: the first two tokens
  # are volume, the last two are issue.
  [
    "Vol. 5, No. 6",
    "vol. 2 no. 1",
    "Volume 22 Issue 14"
  ].each do |text|
    toks = @crfparser.prepare_token_data(text)
    [0, 1].each { |pos| @crfparser.possible_volume(toks, pos).should == 'volume' }
    [-2, -1].each { |pos| @crfparser.possible_volume(toks, pos).should == 'issue' }
  end

  # Marker and number with no space between them: first token is volume,
  # last token is issue.
  [
    "Vol.19, No.1",
    "Vol.19 No.1",
  ].each do |text|
    toks = @crfparser.prepare_token_data(text)
    @crfparser.possible_volume(toks, 0).should == 'volume'
    @crfparser.possible_volume(toks, -1).should == 'issue'
  end

  # volume(issue) notation: token 0 is volume, tokens 1 through 3 are issue.
  [
    "10(7)",
    "21(2):",
    "44 (2)",
    "38( 3)"
  ].each do |text|
    toks = @crfparser.prepare_token_data(text)
    @crfparser.possible_volume(toks, 0).should == 'volume'
    1.upto(3) { |pos| @crfparser.possible_volume(toks, pos).should == 'issue' }
  end

  # volume:pages notation: only token 0 is volume, everything after is not.
  [
    "19:3-4",
    "29:1, 287-291.",
    "38: 942-949.",
    "1: 3",
  ].each do |text|
    toks = @crfparser.prepare_token_data(text)
    @crfparser.possible_volume(toks, 0).should == 'volume'
    1.upto(toks.length - 1) { |pos| @crfparser.possible_volume(toks, pos).should == 'noVolume' }
  end

  # Strings where no token may be labelled volume or issue.
  [
    "18, 1988",
    "1977.",
    "(2002)",
    "No. 1: (2002)",
    "4 234 34093",
    "122-123,",
    "38, (3)",
    "2007: 35-65",
    "Web 2.0: an",
    "this volume",
    "1 issue",
    "issue, 1st",
    "no. 1"
  ].each do |text|
    toks = @crfparser.prepare_token_data(text)
    toks.length.times { |pos| @crfparser.possible_volume(toks, pos).should == 'noVolume' }
  end
end
359
+
360
# Tokens carry a part-of-speech tag, and the `part_of_speech` feature reports
# the tag of the token at the given index.
it 'parts of speech' do
  tokens = @crfparser.prepare_token_data('Yo: "This is a test. This is only a test"')
  expected_tags = %w[nnp pps ppl det vbz det nn pp det vbz rb det nn ppr]

  tokens.map { |token| token.part_of_speech }.should == expected_tags

  @crfparser.part_of_speech(tokens, 0).should == 'nnp'
end
367
+
368
# Expected `punct` label for each token of a short sample that includes an
# abbreviation, a word hyphenated across a line break and a multi-hyphen token.
it "punct" do
  tokens = @crfparser.prepare_token_data("A. Smith wrote some-\nthing: AB-CD-EF")
  expected_labels = %w[abbrev others others truncated others hasPunct multiHyphen]

  labels = tokens.length.times.map { |position| @crfparser.punct(tokens, position) }
  labels.should == expected_labels
end
373
+
374
# "John" at index 0 is expected to be a first name, "Smith" at index 1 a last name.
it "names" do
  name_tokens = @crfparser.prepare_token_data("John Smith")

  first_label = @crfparser.firstName(name_tokens, 0)
  last_label = @crfparser.lastName(name_tokens, 1)

  first_label.should == 'firstName'
  last_label.should == 'lastName'
end
380
+
381
# For a name expected to be missing from the dictionaries, the name features
# should recognise it only when the normalized author is passed in.
it "name not in dict" do
  name = 'Arble D Garbled'
  processed_name = @crfparser.normalize_input_author(name)
  tokens = @crfparser.prepare_token_data(name)

  # One hash per token: labels without and with the given author.
  result = tokens.length.times.map do |i|
    {
      :fname_from_dict => @crfparser.firstName(tokens, i),
      :lname_from_dict => @crfparser.lastName(tokens, i),
      :fname_with_given => @crfparser.firstName(tokens, i, processed_name),
      :lname_with_given => @crfparser.lastName(tokens, i, processed_name)
    }
  end

  # Dictionary lookup alone recognises nothing.
  result.each { |labels| labels[:fname_from_dict].should == 'noFirstName' }
  result.each { |labels| labels[:lname_from_dict].should == 'noLastName' }

  # With the given author, only the first token is a first name...
  %w[firstName noFirstName noFirstName].each_with_index do |expected, i|
    result[i][:fname_with_given].should == expected
  end

  # ...and only the third token is a last name.
  %w[noLastName noLastName lastName].each_with_index do |expected, i|
    result[i][:lname_with_given].should == expected
  end
end
405
+
406
+ private
407
+
408
# Feature +f+ at +toks[idx]+ must equal the token's own +lcnp+ value.
def tok_test_toklcnp(f, toks, idx)
  value = nil
  assert_nothing_raised { value = @crfparser.send(f, toks, idx) }
  expected = toks[idx].lcnp
  assert_equal(expected, value)
end
413
+
414
# Feature +f+ at +toks[idx]+ must be one of the two place-name labels.
def tok_test_placeName(f, toks, idx)
  label = nil
  assert_nothing_raised { label = @crfparser.send(f, toks, idx) }
  allowed = %w[noPlaceName placeName]
  assert(allowed.include?(label))
end
419
+
420
# Feature +f+ at +toks[idx]+ must be the raw token's last character (or the
# whole token when it is no longer than one character).
def tok_test_last_1_char(f, toks, idx)
  suffix = nil
  assert_nothing_raised { suffix = @crfparser.send(f, toks, idx) }
  raw = toks[idx].raw
  raw.length > 1 ? assert_equal(1, suffix.length) : assert_equal(raw, suffix)
  assert(raw.end_with?(suffix))
end
430
+
431
# Feature +f+ at +toks[idx]+ must be the raw token's first two characters (or
# the whole token when it is no longer than two characters).
def tok_test_first_2_chars(f, toks, idx)
  prefix = nil
  assert_nothing_raised { prefix = @crfparser.send(f, toks, idx) }
  raw = toks[idx].raw
  raw.length > 2 ? assert_equal(2, prefix.length) : assert_equal(raw, prefix)
  assert(raw.start_with?(prefix))
end
441
+
442
# Feature +f+ at +toks[idx]+ must be one of the two editor labels.
def tok_test_possible_editor(f, toks, idx)
  label = nil
  assert_nothing_raised { label = @crfparser.send(f, toks, idx) }
  allowed = %w[possibleEditors noEditors]
  assert(allowed.include?(label))
end
447
+
448
# Feature +f+ at +toks[idx]+ must be an integer between 0 and 10 inclusive.
#
# The type check uses Integer rather than Fixnum: Fixnum was deprecated in
# Ruby 2.4 and removed in 3.2 (where referencing it raises NameError), while
# on older Rubies every Fixnum is also an Integer, so the check keeps working.
def tok_test_location(f, toks, idx)
  a = nil
  assert_nothing_raised { a = @crfparser.send(f, toks, idx) }
  assert(a.is_a?(Integer))
  assert(a >= 0)
  assert(a <= 10)
end
455
+
456
# Feature +f+ at +toks[idx]+ must be one of the two in-book labels.
def tok_test_in_book(f, toks, idx)
  label = nil
  assert_nothing_raised { label = @crfparser.send(f, toks, idx) }
  allowed = %w[inBook notInBook]
  assert(allowed.include?(label))
end
461
+
462
# Feature +f+ at +toks[idx]+ must be one of the two first-name labels.
def tok_test_firstName(f, toks, idx)
  label = nil
  assert_nothing_raised { label = @crfparser.send(f, toks, idx) }
  allowed = %w[noFirstName firstName]
  assert(allowed.include?(label))
end
467
+
468
# Feature +f+ at +toks[idx]+ must be one character wide and be either one of
# the placeholder values ('a', 'A', 0) or the literal end of the raw token.
def tok_test_last_char(f, toks, idx)
  value = nil
  assert_nothing_raised { value = @crfparser.send(f, toks, idx) }
  value.to_s.length.should == 1
  # Placeholders are tested first so `end_with?` never receives the integer 0.
  placeholder = value == 'a' || value == 'A' || value == 0
  assert(placeholder || toks[idx].raw.end_with?(value))
end
474
+
475
# Feature +f+ at +toks[idx]+ must be the raw token's last four characters (or
# the whole token when it is no longer than four characters).
def tok_test_last_4_chars(f, toks, idx)
  suffix = nil
  assert_nothing_raised { suffix = @crfparser.send(f, toks, idx) }
  raw = toks[idx].raw
  raw.length > 4 ? assert_equal(4, suffix.length) : assert_equal(raw, suffix)
  assert(raw.end_with?(suffix))
end
485
+
486
# Feature +f+ at +toks[idx]+ must be one of the two publisher-name labels.
def tok_test_publisherName(f, toks, idx)
  label = nil
  assert_nothing_raised { label = @crfparser.send(f, toks, idx) }
  allowed = %w[noPublisherName publisherName]
  assert(allowed.include?(label))
end
491
+
492
# Feature +f+ at +toks[idx]+ must be the raw token's first five characters (or
# the whole token when it is no longer than five characters).
def tok_test_first_5_chars(f, toks, idx)
  prefix = nil
  assert_nothing_raised { prefix = @crfparser.send(f, toks, idx) }
  raw = toks[idx].raw
  raw.length > 5 ? assert_equal(5, prefix.length) : assert_equal(raw, prefix)
  assert(raw.start_with?(prefix))
end
502
+
503
# Feature +f+ at +toks[idx]+ must be one of the two in-book labels.
def tok_test_is_in(f, toks, idx)
  label = nil
  assert_nothing_raised { label = @crfparser.send(f, toks, idx) }
  allowed = %w[inBook notInBook]
  assert(allowed.include?(label))
end
508
+
509
# Feature +f+ at +toks[idx]+ must be the raw token's first character (or the
# whole token when it is no longer than one character).
#
# Besides the length check, the value must actually be a prefix of the raw
# token, as the first_2..first_5 checkers already require; without it any
# single character would pass for tokens longer than one character.
def tok_test_first_1_char(f, toks, idx)
  a = nil
  assert_nothing_raised { a = @crfparser.send(f, toks, idx) }
  raw = toks[idx].raw
  if raw.length <= 1
    assert_equal(raw, a)
  else
    assert_equal(1, a.length)
  end
  assert(raw.start_with?(a))
end
518
+
519
# Feature +f+ at +toks[idx]+ must be one of the known number labels.
def tok_test_numbers(f, toks, idx)
  label = nil
  assert_nothing_raised { label = @crfparser.send(f, toks, idx) }
  allowed = %w[year possiblePage possibleVol 1dig 2dig 3dig 4+dig ordinal hasDig nonNum]
  assert(allowed.include?(label))
end
526
+
527
# Feature +f+ at +toks[idx]+ must be one of the two last-name labels.
def tok_test_lastName(f, toks, idx)
  label = nil
  assert_nothing_raised { label = @crfparser.send(f, toks, idx) }
  allowed = %w[noLastName lastName]
  assert(allowed.include?(label))
end
532
+
533
# Feature +f+ at +toks[idx]+ must be the raw token's last three characters (or
# the whole token when it is no longer than three characters).
def tok_test_last_3_chars(f, toks, idx)
  suffix = nil
  assert_nothing_raised { suffix = @crfparser.send(f, toks, idx) }
  raw = toks[idx].raw
  raw.length > 3 ? assert_equal(3, suffix.length) : assert_equal(raw, suffix)
  assert(raw.end_with?(suffix))
end
543
+
544
# Feature +f+ at +toks[idx]+ must be an integer.
#
# The check uses Integer rather than Fixnum: Fixnum was deprecated in Ruby 2.4
# and removed in 3.2 (where referencing it raises NameError), while on older
# Rubies every Fixnum is also an Integer, so the check keeps working.
def tok_test_a_is_in_dict(f, toks, idx)
  n = nil
  assert_nothing_raised { n = @crfparser.send(f, toks, idx) }
  assert(n.is_a?(Integer))
end
549
+
550
# Feature +f+ at +toks[idx]+ must be the raw token's first four characters (or
# the whole token when it is no longer than four characters).
def tok_test_first_4_chars(f, toks, idx)
  prefix = nil
  assert_nothing_raised { prefix = @crfparser.send(f, toks, idx) }
  raw = toks[idx].raw
  raw.length > 4 ? assert_equal(4, prefix.length) : assert_equal(raw, prefix)
  assert(raw.start_with?(prefix))
end
560
+
561
# Feature +f+ at +toks[idx]+ must be one of the two proceedings labels.
def tok_test_is_proceeding(f, toks, idx)
  label = nil
  assert_nothing_raised { label = @crfparser.send(f, toks, idx) }
  allowed = %w[isProc noProc]
  assert(allowed.include?(label))
end
566
+
567
# Feature +f+ at +toks[idx]+ must be one of the four capitalization labels.
def tok_test_capitalization(f, toks, idx)
  label = nil
  assert_nothing_raised { label = @crfparser.send(f, toks, idx) }
  allowed = %w[singleCap InitCap AllCap others]
  assert(allowed.include?(label))
end
572
+
573
# Feature +f+ at +toks[idx]+ must be one of the two month-name labels.
def tok_test_monthName(f, toks, idx)
  label = nil
  assert_nothing_raised { label = @crfparser.send(f, toks, idx) }
  allowed = %w[noMonthName monthName]
  assert(allowed.include?(label))
end
578
+
579
# Feature +f+ at +toks[idx]+ must be the raw token's last two characters (or
# the whole token when it is no longer than two characters).
def tok_test_last_2_chars(f, toks, idx)
  suffix = nil
  assert_nothing_raised { suffix = @crfparser.send(f, toks, idx) }
  raw = toks[idx].raw
  raw.length > 2 ? assert_equal(2, suffix.length) : assert_equal(raw, suffix)
  assert(raw.end_with?(suffix))
end
589
+
590
# Feature +f+ at +toks[idx]+ must be one of the five punctuation labels.
def tok_test_punct(f, toks, idx)
  label = nil
  assert_nothing_raised { label = @crfparser.send(f, toks, idx) }
  allowed = ['multiHyphen', 'truncated', 'abbrev', 'hasPunct', 'others']
  assert(allowed.include?(label))
end
596
+
597
# Feature +f+ at +toks[idx]+ must be the raw token's first three characters
# (or the whole token when it is no longer than three characters).
def tok_test_first_3_chars(f, toks, idx)
  prefix = nil
  assert_nothing_raised { prefix = @crfparser.send(f, toks, idx) }
  raw = toks[idx].raw
  raw.length > 3 ? assert_equal(3, prefix.length) : assert_equal(raw, prefix)
  assert(raw.start_with?(prefix))
end
607
+
608
# Feature +f+ at +toks[idx]+ must be one of the two chapter labels.
def tok_test_possible_chapter(f, toks, idx)
  label = nil
  assert_nothing_raised { label = @crfparser.send(f, toks, idx) }
  allowed = %w[possibleChapter noChapter]
  assert(allowed.include?(label))
end
613
+
614
# Feature +f+ at +toks[idx]+ must produce some non-nil value.
def tok_test_part_of_speech(f, toks, idx)
  tag = nil
  assert_nothing_raised { tag = @crfparser.send(f, toks, idx) }
  tag.should_not be_nil
end
619
+
620
# Feature +f+ at +toks[idx]+ must be one of the three volume labels.
#
# The membership test is wrapped in +assert+; evaluated on its own, its
# result would be discarded and an unexpected label could never fail.
def tok_test_possible_volume(f, toks, idx)
  a = nil
  assert_nothing_raised { a = @crfparser.send(f, toks, idx) }
  assert(['volume', 'issue', 'noVolume'].include?(a))
end
625
+
626
+ # Hacks left over from the conversion from Test::Unit: minimal stand-ins for its assertion helpers.
627
# Test::Unit-style truthiness assertion expressed with RSpec's `should`.
#
# a - value that must be truthy (anything other than nil or false).
#
# The value is reduced to a strict boolean and compared with `==` instead of
# using the `be_true` matcher: `be_true` only exists in RSpec 2 (RSpec 3
# renamed it `be_truthy`), whereas this form passes and fails for exactly the
# same inputs on both.
def assert(a)
  (a ? true : false).should == true
end
630
+
631
# Test::Unit-style equality assertion expressed with RSpec's `should`.
#
# a - expected value
# b - actual value
def assert_equal(a, b)
  expected = a
  actual = b
  actual.should == expected
end
634
+
635
# Stand-in for Test::Unit's assert_nothing_raised: it only runs the given
# block, so an exception raised inside simply propagates to the caller.
def assert_nothing_raised
  yield
end
638
+
639
+ end
640
+
641
+ end
@@ -0,0 +1,4 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'excite'
4
+ require 'pry'