excite 2.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. data/.gitignore +11 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +8 -0
  4. data/Gemfile.lock +69 -0
  5. data/LICENSE +22 -0
  6. data/README.md +46 -0
  7. data/Rakefile +24 -0
  8. data/config/citation_cleanup_rules.yml +68 -0
  9. data/config/parscit_features.yml +55 -0
  10. data/excite.gemspec +30 -0
  11. data/lib/excite/array_helpers.rb +27 -0
  12. data/lib/excite/citation.rb +48 -0
  13. data/lib/excite/crfparser.rb +322 -0
  14. data/lib/excite/postprocessor.rb +252 -0
  15. data/lib/excite/preprocessor.rb +107 -0
  16. data/lib/excite/resources/dicts/female-names +4954 -0
  17. data/lib/excite/resources/dicts/first-names +27926 -0
  18. data/lib/excite/resources/dicts/male-names +3901 -0
  19. data/lib/excite/resources/dicts/months +24 -0
  20. data/lib/excite/resources/dicts/places +43109 -0
  21. data/lib/excite/resources/dicts/publishers +654 -0
  22. data/lib/excite/resources/dicts/surnames +146259 -0
  23. data/lib/excite/resources/html.template +84 -0
  24. data/lib/excite/resources/html_model +0 -0
  25. data/lib/excite/resources/model +0 -0
  26. data/lib/excite/resources/parsCit.template +76 -0
  27. data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
  28. data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
  29. data/lib/excite/resources/trainingdata/verify.rb +97 -0
  30. data/lib/excite/token_features.rb +313 -0
  31. data/lib/excite/version.rb +7 -0
  32. data/lib/excite.rb +13 -0
  33. data/model/test/analysis.csv +54 -0
  34. data/model/test/array_helpers.rb +30 -0
  35. data/model/test/html-analysis.csv +60 -0
  36. data/model/test/html-output.txt +19893 -0
  37. data/model/test/model_test.rb +306 -0
  38. data/model/test/output.txt +16742 -0
  39. data/spec/excite/citation_spec.rb +128 -0
  40. data/spec/excite/crfparser_spec.rb +118 -0
  41. data/spec/excite/postprocessor_spec.rb +68 -0
  42. data/spec/excite/token_features_spec.rb +641 -0
  43. data/spec/spec_helper.rb +4 -0
  44. metadata +222 -0
@@ -0,0 +1,84 @@
1
+ # See config/parscit_features.yml for function-to-number correlation
2
+
3
+ # Unigram
4
+ U00:%x[-3,0]
5
+ U01:%x[-2,0]
6
+ U02:%x[-1,0]
7
+ U03:%x[0,0]
8
+ U04:%x[1,0]
9
+ U05:%x[2,0]
10
+ U06:%x[3,0]
11
+ U07:%x[-1,0]/%x[0,0]
12
+ U08:%x[0,0]/%x[1,0]
13
+
14
+ # last Char
15
+ U10:%x[0,1]
16
+ U11:%x[-1,1]
17
+
18
+ # first 1-4
19
+ U20:%x[0,2]
20
+ U21:%x[0,3]
21
+ U22:%x[0,4]
22
+ U23:%x[0,5]
23
+
24
+ # last 1-4
25
+ U30:%x[0,6]
26
+ U31:%x[0,7]
27
+ U32:%x[0,8]
28
+ U33:%x[0,9]
29
+
30
+ # Lowercased, no punct
31
+ U40:%x[-2,10]
32
+ U41:%x[-1,10]
33
+ U42:%x[0,10]
34
+ U43:%x[1,10]
35
+ U44:%x[2,10]
36
+
37
+ # Capitalization
38
+ U50:%x[0,11]
39
+
40
+ # Numbers
41
+ U60:%x[-1,12]
42
+ U61:%x[0,12]
43
+ U62:%x[1,12]
44
+ U63:%x[-1,12]/%x[0,12]
45
+ U64:%x[0,12]/%x[1,12]
46
+
47
+ # Dict info
48
+ U70:%x[0,13]
49
+ U71:%x[0,14]
50
+ U72:%x[0,15]
51
+ U73:%x[0,16]
52
+ U74:%x[0,17]
53
+ U75:%x[0,18]
54
+
55
+ # Possible editor?
56
+ U80:%x[0,19]
57
+
58
+ # Position
59
+ U90:%x[0,20]
60
+
61
+ # Punctuation
62
+ UA0:%x[0,21]
63
+
64
+ # possible chapter?
65
+ UB0:%x[-1,25]/%x[0,22]/%x[0,19]
66
+ UB1:%x[-1,25]/%x[0,22]/%x[1,11]
67
+
68
+ # html tag
69
+ UH2:%x[0,23]
70
+ UH3:%x[1,23]
71
+ UH4:%x[-1,23]
72
+ UH5:%x[-1,23]/%x[0,23]
73
+ UH6:%x[0,23]/%x[1,23]
74
+ UH7:%x[0,24]
75
+
76
+ # part of speech
77
+ UP0:%x[0,25]
78
+ UP1:%x[-1,25]
79
+ UP2:%x[1,25]
80
+ UP3:%x[-1,25]/%x[0,25]
81
+ UP4:%x[0,25]/%x[1,25]
82
+
83
+ # Bigram
84
+ B
Binary file
Binary file
@@ -0,0 +1,76 @@
1
+ # See config/parscit_features.yml for function-to-number correlation
2
+
3
+ # Unigram
4
+ U00:%x[-3,0]
5
+ U01:%x[-2,0]
6
+ U02:%x[-1,0]
7
+ U03:%x[0,0]
8
+ U04:%x[1,0]
9
+ U05:%x[2,0]
10
+ U06:%x[3,0]
11
+ U07:%x[-1,0]/%x[0,0]
12
+ U08:%x[0,0]/%x[1,0]
13
+
14
+ # last Char
15
+ U10:%x[0,1]
16
+ U11:%x[-1,1]
17
+
18
+ # first 1-4
19
+ U20:%x[0,2]
20
+ U21:%x[0,3]
21
+ U22:%x[0,4]
22
+ U23:%x[0,5]
23
+
24
+ # last 1-4
25
+ U30:%x[0,6]
26
+ U31:%x[0,7]
27
+ U32:%x[0,8]
28
+ U33:%x[0,9]
29
+
30
+ # Lowercased, no punct
31
+ U40:%x[-2,10]
32
+ U41:%x[-1,10]
33
+ U42:%x[0,10]
34
+ U43:%x[1,10]
35
+ U44:%x[2,10]
36
+
37
+ # Capitalization
38
+ U50:%x[0,11]
39
+
40
+ # Numbers
41
+ U60:%x[-1,12]
42
+ U61:%x[0,12]
43
+ U62:%x[1,12]
44
+ U63:%x[-1,12]/%x[0,12]
45
+ U64:%x[0,12]/%x[1,12]
46
+
47
+ # Dict info
48
+ U70:%x[0,13]
49
+ U71:%x[0,14]
50
+ U72:%x[0,15]
51
+ U73:%x[0,16]
52
+ U74:%x[0,17]
53
+ U75:%x[0,18]
54
+
55
+ # Possible editor?
56
+ U80:%x[0,19]
57
+
58
+ # Position
59
+ U90:%x[0,20]
60
+
61
+ # Punctuation
62
+ UA0:%x[0,21]
63
+
64
+ # possible chapter?
65
+ UB0:%x[-1,23]/%x[0,22]/%x[0,19]
66
+ UB1:%x[-1,23]/%x[0,22]/%x[1,11]
67
+
68
+ # part of speech
69
+ UP0:%x[0,23]
70
+ UP1:%x[-1,23]
71
+ UP2:%x[1,23]
72
+ UP3:%x[-1,23]/%x[0,23]
73
+ UP4:%x[0,23]/%x[1,23]
74
+
75
+ # Bigram
76
+ B