excite 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. data/.gitignore +11 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +8 -0
  4. data/Gemfile.lock +69 -0
  5. data/LICENSE +22 -0
  6. data/README.md +46 -0
  7. data/Rakefile +24 -0
  8. data/config/citation_cleanup_rules.yml +68 -0
  9. data/config/parscit_features.yml +55 -0
  10. data/excite.gemspec +30 -0
  11. data/lib/excite/array_helpers.rb +27 -0
  12. data/lib/excite/citation.rb +48 -0
  13. data/lib/excite/crfparser.rb +322 -0
  14. data/lib/excite/postprocessor.rb +252 -0
  15. data/lib/excite/preprocessor.rb +107 -0
  16. data/lib/excite/resources/dicts/female-names +4954 -0
  17. data/lib/excite/resources/dicts/first-names +27926 -0
  18. data/lib/excite/resources/dicts/male-names +3901 -0
  19. data/lib/excite/resources/dicts/months +24 -0
  20. data/lib/excite/resources/dicts/places +43109 -0
  21. data/lib/excite/resources/dicts/publishers +654 -0
  22. data/lib/excite/resources/dicts/surnames +146259 -0
  23. data/lib/excite/resources/html.template +84 -0
  24. data/lib/excite/resources/html_model +0 -0
  25. data/lib/excite/resources/model +0 -0
  26. data/lib/excite/resources/parsCit.template +76 -0
  27. data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
  28. data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
  29. data/lib/excite/resources/trainingdata/verify.rb +97 -0
  30. data/lib/excite/token_features.rb +313 -0
  31. data/lib/excite/version.rb +7 -0
  32. data/lib/excite.rb +13 -0
  33. data/model/test/analysis.csv +54 -0
  34. data/model/test/array_helpers.rb +30 -0
  35. data/model/test/html-analysis.csv +60 -0
  36. data/model/test/html-output.txt +19893 -0
  37. data/model/test/model_test.rb +306 -0
  38. data/model/test/output.txt +16742 -0
  39. data/spec/excite/citation_spec.rb +128 -0
  40. data/spec/excite/crfparser_spec.rb +118 -0
  41. data/spec/excite/postprocessor_spec.rb +68 -0
  42. data/spec/excite/token_features_spec.rb +641 -0
  43. data/spec/spec_helper.rb +4 -0
  44. metadata +222 -0
@@ -0,0 +1,654 @@
1
+ addison
2
+ alexander
3
+ alfred
4
+ allan
5
+ allen
6
+ allyn
7
+ andre
8
+ angus
9
+ berkley
10
+ blake
11
+ boyd
12
+ brooks
13
+ carroll
14
+ charles
15
+ chick
16
+ clark
17
+ cole
18
+ court
19
+ del
20
+ delmar
21
+ dick
22
+ faber
23
+ fitzgerald
24
+ francis
25
+ franklin
26
+ fraser
27
+ george
28
+ gordon
29
+ gregg
30
+ grove
31
+ hale
32
+ hall
33
+ hamilton
34
+ hamish
35
+ harcourt
36
+ harry
37
+ hill
38
+ holt
39
+ ian
40
+ ivan
41
+ james
42
+ john
43
+ joseph
44
+ kelly
45
+ lane
46
+ law
47
+ lion
48
+ lock
49
+ marcel
50
+ marion
51
+ marshall
52
+ matthias
53
+ mead
54
+ merrill
55
+ michael
56
+ miles
57
+ morgan
58
+ murray
59
+ nelson
60
+ north
61
+ orion
62
+ owen
63
+ pascal
64
+ patrick
65
+ paul
66
+ peter
67
+ prentice
68
+ putnam
69
+ ray
70
+ reed
71
+ robert
72
+ saunders
73
+ simon
74
+ smith
75
+ stanford
76
+ taylor
77
+ thomas
78
+ victor
79
+ wadsworth
80
+ walker
81
+ ward
82
+ warner
83
+ wesley
84
+ west
85
+ wiley
86
+ winston
87
+ xavier
88
+ yale
89
+ ardis
90
+ concordia
91
+ dee
92
+ garland
93
+ jada
94
+ lucie
95
+ minerva
96
+ sophia
97
+ star
98
+ williams
99
+ brown
100
+ lewis
101
+ baker
102
+ collins
103
+ barnes
104
+ long
105
+ black
106
+ mills
107
+ stone
108
+ hudson
109
+ stephens
110
+ andrews
111
+ chapman
112
+ little
113
+ pearson
114
+ holland
115
+ hopkins
116
+ watts
117
+ haynes
118
+ bass
119
+ wilkins
120
+ collier
121
+ hutchinson
122
+ blackwell
123
+ johns
124
+ golden
125
+ english
126
+ house
127
+ petty
128
+ good
129
+ workman
130
+ andersen
131
+ dodd
132
+ springer
133
+ blue
134
+ tuttle
135
+ dunham
136
+ abrams
137
+ mcgraw
138
+ thomson
139
+ street
140
+ crowell
141
+ hackett
142
+ london
143
+ dutton
144
+ boston
145
+ sams
146
+ schuster
147
+ duckworth
148
+ brunner
149
+ pickering
150
+ rinehart
151
+ graf
152
+ kraus
153
+ south
154
+ churchill
155
+ box
156
+ falcon
157
+ new
158
+ houghton
159
+ cave
160
+ canada
161
+ council
162
+ dover
163
+ mosby
164
+ manson
165
+ sparrow
166
+ peace
167
+ harwood
168
+ spring
169
+ robson
170
+ brill
171
+ apple
172
+ scribner
173
+ pitman
174
+ cassell
175
+ book
176
+ macmillan
177
+ deutsch
178
+ bridge
179
+ wales
180
+ oxford
181
+ wisdom
182
+ calder
183
+ dryden
184
+ copper
185
+ mcclurg
186
+ kaufmann
187
+ rager
188
+ reader
189
+ manchester
190
+ wine
191
+ to
192
+ kessinger
193
+ creek
194
+ telford
195
+ phoenix
196
+ sohn
197
+ belknap
198
+ grafton
199
+ spoon
200
+ heinemann
201
+ pan
202
+ sons
203
+ press
204
+ touchstone
205
+ dublin
206
+ orchard
207
+ capo
208
+ cambridge
209
+ auerbach
210
+ knopf
211
+ dekker
212
+ virgin
213
+ livingstone
214
+ atlas
215
+ daw
216
+ harvill
217
+ river
218
+ maclennan
219
+ books
220
+ mifflin
221
+ newbury
222
+ an
223
+ harvard
224
+ stackpole
225
+ harbor
226
+ hodder
227
+ doubleday
228
+ columbia
229
+ ballantine
230
+ student
231
+ world
232
+ imperial
233
+ vent
234
+ quay
235
+ hogarth
236
+ routledge
237
+ melbourne
238
+ arts
239
+ dorset
240
+ wordsworth
241
+ sovereign
242
+ heinle
243
+ cavendish
244
+ brookings
245
+ rover
246
+ glas
247
+ america
248
+ ace
249
+ thornes
250
+ russian
251
+ end
252
+ emerald
253
+ quiet
254
+ lights
255
+ city
256
+ breach
257
+ times
258
+ sadan
259
+ mcmeel
260
+ longman
261
+ harvest
262
+ college
263
+ va
264
+ godine
265
+ bison
266
+ plume
267
+ media
268
+ course
269
+ uk
270
+ tulip
271
+ oak
272
+ left
273
+ zephyr
274
+ war
275
+ verso
276
+ station
277
+ mini
278
+ duxbury
279
+ american
280
+ da
281
+ que
282
+ california
283
+ australia
284
+ michigan
285
+ chicago
286
+ central
287
+ indiana
288
+ toronto
289
+ minnesota
290
+ kentucky
291
+ iowa
292
+ nebraska
293
+ alaska
294
+ epworth
295
+ arcadia
296
+ usa
297
+ methuen
298
+ university
299
+ of
300
+ athlone
301
+ canyon
302
+ princeton
303
+ progress
304
+ prospect
305
+ lutterworth
306
+ egg
307
+ cloverdale
308
+ atlantic
309
+ triangle
310
+ kindersley
311
+ a
312
+ abode
313
+ academic
314
+ architectural
315
+ association
316
+ autodesk
317
+ b
318
+ bacan
319
+ bertelsmannspringer
320
+ birkhaeuser
321
+ business
322
+ cold
323
+ company
324
+ copernicus
325
+ corporation
326
+ crc
327
+ e
328
+ economics
329
+ editions
330
+ education
331
+ educational
332
+ ersnt
333
+ europa
334
+ f
335
+ fence
336
+ for
337
+ futura
338
+ g
339
+ gmbh
340
+ group
341
+ humana
342
+ ieee
343
+ inc
344
+ ingress
345
+ institute
346
+ international
347
+ itp
348
+ jossey
349
+ kluwer
350
+ laboratory
351
+ learning
352
+ ltd
353
+ mathematical
354
+ medical
355
+ microbiology
356
+ milady
357
+ n
358
+ narosa
359
+ neurological
360
+ onword
361
+ oultedge
362
+ peachpit
363
+ penguin
364
+ physica
365
+ physics
366
+ plenum
367
+ professional
368
+ psychological
369
+ psychology
370
+ ptr
371
+ pty
372
+ publications
373
+ publishers
374
+ publishing
375
+ pvt
376
+ riders
377
+ routledgeflamer
378
+ s
379
+ science
380
+ singular
381
+ society
382
+ software
383
+ southwestern
384
+ spon
385
+ st
386
+ steinkopff
387
+ surgeons
388
+ technip
389
+ technology
390
+ the
391
+ verlag
392
+ vnr
393
+ w
394
+ wileyheyden
395
+ wileyliss
396
+ wileylnterscience
397
+ wileyvch
398
+ wissenschafts
399
+ abacus
400
+ ac
401
+ addison-wesley
402
+ ak
403
+ akashic
404
+ aladdin
405
+ allenunwin
406
+ and
407
+ anvil
408
+ arcade
409
+ archive
410
+ arkham
411
+ artscroll
412
+ associated
413
+ at
414
+ atheneum
415
+ atom
416
+ avari
417
+ baen
418
+ bantam
419
+ barriejenkins
420
+ basic
421
+ bbc
422
+ birkhauser
423
+ blackstaff
424
+ blackwell's
425
+ bloodaxe
426
+ blooming
427
+ bloomsbury
428
+ bobbs-merrill
429
+ booksforabuckcom
430
+ bookshops
431
+ borzoi
432
+ boyars
433
+ boydellbrewer
434
+ boyds
435
+ bswk
436
+ butterworth-heinemann
437
+ bw
438
+ c
439
+ canongate
440
+ carcanet
441
+ carrer
442
+ catalog
443
+ catbird
444
+ century
445
+ change
446
+ chicken
447
+ classics
448
+ club
449
+ cockerel
450
+ communications
451
+ companionguidescom
452
+ continuum
453
+ coronet
454
+ counterpoint
455
+ creation
456
+ crockerbrewster
457
+ daedalus
458
+ dalkey
459
+ deagostini
460
+ dedalus
461
+ delacorte
462
+ denbridge
463
+ desk
464
+ detrituscom
465
+ directions
466
+ directmedia
467
+ dorling
468
+ eburon
469
+ ecw
470
+ eerdmans
471
+ ellora's
472
+ elsevier
473
+ enterprises
474
+ equity
475
+ european
476
+ everyman's
477
+ exact
478
+ eyrespottiswoode
479
+ gk
480
+ gollancz
481
+ goose
482
+ greenery
483
+ hallcompany
484
+ harlequin
485
+ harperbrothers
486
+ harpercollins
487
+ harperprism
488
+ harperrow
489
+ harpertrophy
490
+ hbj
491
+ headline
492
+ helter
493
+ hesperus
494
+ hms
495
+ hmso
496
+ hodderstoughton
497
+ hotei
498
+ hyperion
499
+ idc
500
+ imprint
501
+ information
502
+ inpress
503
+ inscape
504
+ insomniac
505
+ institution
506
+ interlink
507
+ inter-varsity
508
+ intervarsity
509
+ ivyspring
510
+ jarrolds
511
+ kegan
512
+ kehot
513
+ kodansha
514
+ kregel
515
+ ladybird
516
+ legend
517
+ library
518
+ limited
519
+ llc
520
+ lobster
521
+ lockco
522
+ mainstream
523
+ mandrake
524
+ manuals
525
+ martin's
526
+ martinus
527
+ masquerade
528
+ mcfarlandcompany
529
+ medknow
530
+ miegunyah
531
+ millsboon
532
+ mit
533
+ mjs
534
+ mocho
535
+ modern
536
+ monthly
537
+ monument
538
+ museum
539
+ mycroftmoran
540
+ nauka
541
+ newnes
542
+ nijhoff
543
+ nonesuch
544
+ noontide
545
+ northpoint
546
+ northwestern
547
+ nortoncompany
548
+ nyrb
549
+ office
550
+ online
551
+ open
552
+ o'reilly
553
+ overlook
554
+ oxfam
555
+ p
556
+ palgrave
557
+ pantheon
558
+ paperbacks
559
+ parallax
560
+ pathfinder
561
+ paulist
562
+ pavilion
563
+ perennis
564
+ pergamon
565
+ phaidon
566
+ philtrum
567
+ picador
568
+ piccadilly
569
+ pimlico
570
+ plc
571
+ poetry
572
+ poseidon
573
+ presses
574
+ prometheus
575
+ publication
576
+ publisher
577
+ puffin
578
+ pulpnet
579
+ putnam's
580
+ quartet
581
+ quebecor
582
+ r
583
+ random
584
+ recorded
585
+ reference
586
+ rmit
587
+ rockcompany
588
+ routledgesons
589
+ rubicon
590
+ scarecrow
591
+ schocken
592
+ schofieldsims
593
+ scholars
594
+ scholastic
595
+ scientific
596
+ scm
597
+ scribner's
598
+ seckerwarburg
599
+ sensorotika
600
+ shambhala
601
+ shanti
602
+ shoemakerhoard
603
+ sidgwickjackson
604
+ signet
605
+ skelter
606
+ snake
607
+ spacepol
608
+ spck
609
+ spectra
610
+ spottiswoode
611
+ stantonlee
612
+ stationery
613
+ steerforth
614
+ stovepipe
615
+ sunmoon
616
+ suny
617
+ tachyon
618
+ tantivy
619
+ target
620
+ tartarus
621
+ taschen
622
+ taylorfrancis
623
+ tenspeed
624
+ thameshudson
625
+ third
626
+ ticonderoga
627
+ time
628
+ tor
629
+ trade
630
+ tree
631
+ tt
632
+ twisted
633
+ ucl
634
+ unfinished
635
+ usborne
636
+ vedanta
637
+ viking
638
+ vintage
639
+ virago
640
+ vision
641
+ voyager
642
+ vsp
643
+ website
644
+ weidenfeldnicolson
645
+ wesleyan
646
+ westbow
647
+ wildside
648
+ wileysons
649
+ windrush
650
+ women’s
651
+ writing
652
+ ww
653
+ yoseloff
654
+ zondervan