excite 2.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. data/.gitignore +11 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +8 -0
  4. data/Gemfile.lock +69 -0
  5. data/LICENSE +22 -0
  6. data/README.md +46 -0
  7. data/Rakefile +24 -0
  8. data/config/citation_cleanup_rules.yml +68 -0
  9. data/config/parscit_features.yml +55 -0
  10. data/excite.gemspec +30 -0
  11. data/lib/excite/array_helpers.rb +27 -0
  12. data/lib/excite/citation.rb +48 -0
  13. data/lib/excite/crfparser.rb +322 -0
  14. data/lib/excite/postprocessor.rb +252 -0
  15. data/lib/excite/preprocessor.rb +107 -0
  16. data/lib/excite/resources/dicts/female-names +4954 -0
  17. data/lib/excite/resources/dicts/first-names +27926 -0
  18. data/lib/excite/resources/dicts/male-names +3901 -0
  19. data/lib/excite/resources/dicts/months +24 -0
  20. data/lib/excite/resources/dicts/places +43109 -0
  21. data/lib/excite/resources/dicts/publishers +654 -0
  22. data/lib/excite/resources/dicts/surnames +146259 -0
  23. data/lib/excite/resources/html.template +84 -0
  24. data/lib/excite/resources/html_model +0 -0
  25. data/lib/excite/resources/model +0 -0
  26. data/lib/excite/resources/parsCit.template +76 -0
  27. data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
  28. data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
  29. data/lib/excite/resources/trainingdata/verify.rb +97 -0
  30. data/lib/excite/token_features.rb +313 -0
  31. data/lib/excite/version.rb +7 -0
  32. data/lib/excite.rb +13 -0
  33. data/model/test/analysis.csv +54 -0
  34. data/model/test/array_helpers.rb +30 -0
  35. data/model/test/html-analysis.csv +60 -0
  36. data/model/test/html-output.txt +19893 -0
  37. data/model/test/model_test.rb +306 -0
  38. data/model/test/output.txt +16742 -0
  39. data/spec/excite/citation_spec.rb +128 -0
  40. data/spec/excite/crfparser_spec.rb +118 -0
  41. data/spec/excite/postprocessor_spec.rb +68 -0
  42. data/spec/excite/token_features_spec.rb +641 -0
  43. data/spec/spec_helper.rb +4 -0
  44. metadata +222 -0
@@ -0,0 +1,654 @@
1
+ addison
2
+ alexander
3
+ alfred
4
+ allan
5
+ allen
6
+ allyn
7
+ andre
8
+ angus
9
+ berkley
10
+ blake
11
+ boyd
12
+ brooks
13
+ carroll
14
+ charles
15
+ chick
16
+ clark
17
+ cole
18
+ court
19
+ del
20
+ delmar
21
+ dick
22
+ faber
23
+ fitzgerald
24
+ francis
25
+ franklin
26
+ fraser
27
+ george
28
+ gordon
29
+ gregg
30
+ grove
31
+ hale
32
+ hall
33
+ hamilton
34
+ hamish
35
+ harcourt
36
+ harry
37
+ hill
38
+ holt
39
+ ian
40
+ ivan
41
+ james
42
+ john
43
+ joseph
44
+ kelly
45
+ lane
46
+ law
47
+ lion
48
+ lock
49
+ marcel
50
+ marion
51
+ marshall
52
+ matthias
53
+ mead
54
+ merrill
55
+ michael
56
+ miles
57
+ morgan
58
+ murray
59
+ nelson
60
+ north
61
+ orion
62
+ owen
63
+ pascal
64
+ patrick
65
+ paul
66
+ peter
67
+ prentice
68
+ putnam
69
+ ray
70
+ reed
71
+ robert
72
+ saunders
73
+ simon
74
+ smith
75
+ stanford
76
+ taylor
77
+ thomas
78
+ victor
79
+ wadsworth
80
+ walker
81
+ ward
82
+ warner
83
+ wesley
84
+ west
85
+ wiley
86
+ winston
87
+ xavier
88
+ yale
89
+ ardis
90
+ concordia
91
+ dee
92
+ garland
93
+ jada
94
+ lucie
95
+ minerva
96
+ sophia
97
+ star
98
+ williams
99
+ brown
100
+ lewis
101
+ baker
102
+ collins
103
+ barnes
104
+ long
105
+ black
106
+ mills
107
+ stone
108
+ hudson
109
+ stephens
110
+ andrews
111
+ chapman
112
+ little
113
+ pearson
114
+ holland
115
+ hopkins
116
+ watts
117
+ haynes
118
+ bass
119
+ wilkins
120
+ collier
121
+ hutchinson
122
+ blackwell
123
+ johns
124
+ golden
125
+ english
126
+ house
127
+ petty
128
+ good
129
+ workman
130
+ andersen
131
+ dodd
132
+ springer
133
+ blue
134
+ tuttle
135
+ dunham
136
+ abrams
137
+ mcgraw
138
+ thomson
139
+ street
140
+ crowell
141
+ hackett
142
+ london
143
+ dutton
144
+ boston
145
+ sams
146
+ schuster
147
+ duckworth
148
+ brunner
149
+ pickering
150
+ rinehart
151
+ graf
152
+ kraus
153
+ south
154
+ churchill
155
+ box
156
+ falcon
157
+ new
158
+ houghton
159
+ cave
160
+ canada
161
+ council
162
+ dover
163
+ mosby
164
+ manson
165
+ sparrow
166
+ peace
167
+ harwood
168
+ spring
169
+ robson
170
+ brill
171
+ apple
172
+ scribner
173
+ pitman
174
+ cassell
175
+ book
176
+ macmillan
177
+ deutsch
178
+ bridge
179
+ wales
180
+ oxford
181
+ wisdom
182
+ calder
183
+ dryden
184
+ copper
185
+ mcclurg
186
+ kaufmann
187
+ rager
188
+ reader
189
+ manchester
190
+ wine
191
+ to
192
+ kessinger
193
+ creek
194
+ telford
195
+ phoenix
196
+ sohn
197
+ belknap
198
+ grafton
199
+ spoon
200
+ heinemann
201
+ pan
202
+ sons
203
+ press
204
+ touchstone
205
+ dublin
206
+ orchard
207
+ capo
208
+ cambridge
209
+ auerbach
210
+ knopf
211
+ dekker
212
+ virgin
213
+ livingstone
214
+ atlas
215
+ daw
216
+ harvill
217
+ river
218
+ maclennan
219
+ books
220
+ mifflin
221
+ newbury
222
+ an
223
+ harvard
224
+ stackpole
225
+ harbor
226
+ hodder
227
+ doubleday
228
+ columbia
229
+ ballantine
230
+ student
231
+ world
232
+ imperial
233
+ vent
234
+ quay
235
+ hogarth
236
+ routledge
237
+ melbourne
238
+ arts
239
+ dorset
240
+ wordsworth
241
+ sovereign
242
+ heinle
243
+ cavendish
244
+ brookings
245
+ rover
246
+ glas
247
+ america
248
+ ace
249
+ thornes
250
+ russian
251
+ end
252
+ emerald
253
+ quiet
254
+ lights
255
+ city
256
+ breach
257
+ times
258
+ sadan
259
+ mcmeel
260
+ longman
261
+ harvest
262
+ college
263
+ va
264
+ godine
265
+ bison
266
+ plume
267
+ media
268
+ course
269
+ uk
270
+ tulip
271
+ oak
272
+ left
273
+ zephyr
274
+ war
275
+ verso
276
+ station
277
+ mini
278
+ duxbury
279
+ american
280
+ da
281
+ que
282
+ california
283
+ australia
284
+ michigan
285
+ chicago
286
+ central
287
+ indiana
288
+ toronto
289
+ minnesota
290
+ kentucky
291
+ iowa
292
+ nebraska
293
+ alaska
294
+ epworth
295
+ arcadia
296
+ usa
297
+ methuen
298
+ university
299
+ of
300
+ athlone
301
+ canyon
302
+ princeton
303
+ progress
304
+ prospect
305
+ lutterworth
306
+ egg
307
+ cloverdale
308
+ atlantic
309
+ triangle
310
+ kindersley
311
+ a
312
+ abode
313
+ academic
314
+ architectural
315
+ association
316
+ autodesk
317
+ b
318
+ bacan
319
+ bertelsmannspringer
320
+ birkhaeuser
321
+ business
322
+ cold
323
+ company
324
+ copernicus
325
+ corporation
326
+ crc
327
+ e
328
+ economics
329
+ editions
330
+ education
331
+ educational
332
+ ersnt
333
+ europa
334
+ f
335
+ fence
336
+ for
337
+ futura
338
+ g
339
+ gmbh
340
+ group
341
+ humana
342
+ ieee
343
+ inc
344
+ ingress
345
+ institute
346
+ international
347
+ itp
348
+ jossey
349
+ kluwer
350
+ laboratory
351
+ learning
352
+ ltd
353
+ mathematical
354
+ medical
355
+ microbiology
356
+ milady
357
+ n
358
+ narosa
359
+ neurological
360
+ onword
361
+ oultedge
362
+ peachpit
363
+ penguin
364
+ physica
365
+ physics
366
+ plenum
367
+ professional
368
+ psychological
369
+ psychology
370
+ ptr
371
+ pty
372
+ publications
373
+ publishers
374
+ publishing
375
+ pvt
376
+ riders
377
+ routledgeflamer
378
+ s
379
+ science
380
+ singular
381
+ society
382
+ software
383
+ southwestern
384
+ spon
385
+ st
386
+ steinkopff
387
+ surgeons
388
+ technip
389
+ technology
390
+ the
391
+ verlag
392
+ vnr
393
+ w
394
+ wileyheyden
395
+ wileyliss
396
+ wileylnterscience
397
+ wileyvch
398
+ wissenschafts
399
+ abacus
400
+ ac
401
+ addison-wesley
402
+ ak
403
+ akashic
404
+ aladdin
405
+ allenunwin
406
+ and
407
+ anvil
408
+ arcade
409
+ archive
410
+ arkham
411
+ artscroll
412
+ associated
413
+ at
414
+ atheneum
415
+ atom
416
+ avari
417
+ baen
418
+ bantam
419
+ barriejenkins
420
+ basic
421
+ bbc
422
+ birkhauser
423
+ blackstaff
424
+ blackwell's
425
+ bloodaxe
426
+ blooming
427
+ bloomsbury
428
+ bobbs-merrill
429
+ booksforabuckcom
430
+ bookshops
431
+ borzoi
432
+ boyars
433
+ boydellbrewer
434
+ boyds
435
+ bswk
436
+ butterworth-heinemann
437
+ bw
438
+ c
439
+ canongate
440
+ carcanet
441
+ carrer
442
+ catalog
443
+ catbird
444
+ century
445
+ change
446
+ chicken
447
+ classics
448
+ club
449
+ cockerel
450
+ communications
451
+ companionguidescom
452
+ continuum
453
+ coronet
454
+ counterpoint
455
+ creation
456
+ crockerbrewster
457
+ daedalus
458
+ dalkey
459
+ deagostini
460
+ dedalus
461
+ delacorte
462
+ denbridge
463
+ desk
464
+ detrituscom
465
+ directions
466
+ directmedia
467
+ dorling
468
+ eburon
469
+ ecw
470
+ eerdmans
471
+ ellora's
472
+ elsevier
473
+ enterprises
474
+ equity
475
+ european
476
+ everyman's
477
+ exact
478
+ eyrespottiswoode
479
+ gk
480
+ gollancz
481
+ goose
482
+ greenery
483
+ hallcompany
484
+ harlequin
485
+ harperbrothers
486
+ harpercollins
487
+ harperprism
488
+ harperrow
489
+ harpertrophy
490
+ hbj
491
+ headline
492
+ helter
493
+ hesperus
494
+ hms
495
+ hmso
496
+ hodderstoughton
497
+ hotei
498
+ hyperion
499
+ idc
500
+ imprint
501
+ information
502
+ inpress
503
+ inscape
504
+ insomniac
505
+ institution
506
+ interlink
507
+ inter-varsity
508
+ intervarsity
509
+ ivyspring
510
+ jarrolds
511
+ kegan
512
+ kehot
513
+ kodansha
514
+ kregel
515
+ ladybird
516
+ legend
517
+ library
518
+ limited
519
+ llc
520
+ lobster
521
+ lockco
522
+ mainstream
523
+ mandrake
524
+ manuals
525
+ martin's
526
+ martinus
527
+ masquerade
528
+ mcfarlandcompany
529
+ medknow
530
+ miegunyah
531
+ millsboon
532
+ mit
533
+ mjs
534
+ mocho
535
+ modern
536
+ monthly
537
+ monument
538
+ museum
539
+ mycroftmoran
540
+ nauka
541
+ newnes
542
+ nijhoff
543
+ nonesuch
544
+ noontide
545
+ northpoint
546
+ northwestern
547
+ nortoncompany
548
+ nyrb
549
+ office
550
+ online
551
+ open
552
+ o'reilly
553
+ overlook
554
+ oxfam
555
+ p
556
+ palgrave
557
+ pantheon
558
+ paperbacks
559
+ parallax
560
+ pathfinder
561
+ paulist
562
+ pavilion
563
+ perennis
564
+ pergamon
565
+ phaidon
566
+ philtrum
567
+ picador
568
+ piccadilly
569
+ pimlico
570
+ plc
571
+ poetry
572
+ poseidon
573
+ presses
574
+ prometheus
575
+ publication
576
+ publisher
577
+ puffin
578
+ pulpnet
579
+ putnam's
580
+ quartet
581
+ quebecor
582
+ r
583
+ random
584
+ recorded
585
+ reference
586
+ rmit
587
+ rockcompany
588
+ routledgesons
589
+ rubicon
590
+ scarecrow
591
+ schocken
592
+ schofieldsims
593
+ scholars
594
+ scholastic
595
+ scientific
596
+ scm
597
+ scribner's
598
+ seckerwarburg
599
+ sensorotika
600
+ shambhala
601
+ shanti
602
+ shoemakerhoard
603
+ sidgwickjackson
604
+ signet
605
+ skelter
606
+ snake
607
+ spacepol
608
+ spck
609
+ spectra
610
+ spottiswoode
611
+ stantonlee
612
+ stationery
613
+ steerforth
614
+ stovepipe
615
+ sunmoon
616
+ suny
617
+ tachyon
618
+ tantivy
619
+ target
620
+ tartarus
621
+ taschen
622
+ taylorfrancis
623
+ tenspeed
624
+ thameshudson
625
+ third
626
+ ticonderoga
627
+ time
628
+ tor
629
+ trade
630
+ tree
631
+ tt
632
+ twisted
633
+ ucl
634
+ unfinished
635
+ usborne
636
+ vedanta
637
+ viking
638
+ vintage
639
+ virago
640
+ vision
641
+ voyager
642
+ vsp
643
+ website
644
+ weidenfeldnicolson
645
+ wesleyan
646
+ westbow
647
+ wildside
648
+ wileysons
649
+ windrush
650
+ women’s
651
+ writing
652
+ ww
653
+ yoseloff
654
+ zondervan