excite 2.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +11 -0
- data/.rspec +1 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +69 -0
- data/LICENSE +22 -0
- data/README.md +46 -0
- data/Rakefile +24 -0
- data/config/citation_cleanup_rules.yml +68 -0
- data/config/parscit_features.yml +55 -0
- data/excite.gemspec +30 -0
- data/lib/excite/array_helpers.rb +27 -0
- data/lib/excite/citation.rb +48 -0
- data/lib/excite/crfparser.rb +322 -0
- data/lib/excite/postprocessor.rb +252 -0
- data/lib/excite/preprocessor.rb +107 -0
- data/lib/excite/resources/dicts/female-names +4954 -0
- data/lib/excite/resources/dicts/first-names +27926 -0
- data/lib/excite/resources/dicts/male-names +3901 -0
- data/lib/excite/resources/dicts/months +24 -0
- data/lib/excite/resources/dicts/places +43109 -0
- data/lib/excite/resources/dicts/publishers +654 -0
- data/lib/excite/resources/dicts/surnames +146259 -0
- data/lib/excite/resources/html.template +84 -0
- data/lib/excite/resources/html_model +0 -0
- data/lib/excite/resources/model +0 -0
- data/lib/excite/resources/parsCit.template +76 -0
- data/lib/excite/resources/trainingdata/tagged_html_references.txt +500 -0
- data/lib/excite/resources/trainingdata/tagged_references.txt +500 -0
- data/lib/excite/resources/trainingdata/verify.rb +97 -0
- data/lib/excite/token_features.rb +313 -0
- data/lib/excite/version.rb +7 -0
- data/lib/excite.rb +13 -0
- data/model/test/analysis.csv +54 -0
- data/model/test/array_helpers.rb +30 -0
- data/model/test/html-analysis.csv +60 -0
- data/model/test/html-output.txt +19893 -0
- data/model/test/model_test.rb +306 -0
- data/model/test/output.txt +16742 -0
- data/spec/excite/citation_spec.rb +128 -0
- data/spec/excite/crfparser_spec.rb +118 -0
- data/spec/excite/postprocessor_spec.rb +68 -0
- data/spec/excite/token_features_spec.rb +641 -0
- data/spec/spec_helper.rb +4 -0
- metadata +222 -0
@@ -0,0 +1,84 @@
|
|
1
|
+
# See config/parscit_features.yml for function-to-number correlation
|
2
|
+
|
3
|
+
# Unigram
|
4
|
+
U00:%x[-3,0]
|
5
|
+
U01:%x[-2,0]
|
6
|
+
U02:%x[-1,0]
|
7
|
+
U03:%x[0,0]
|
8
|
+
U04:%x[1,0]
|
9
|
+
U05:%x[2,0]
|
10
|
+
U06:%x[3,0]
|
11
|
+
U07:%x[-1,0]/%x[0,0]
|
12
|
+
U08:%x[0,0]/%x[1,0]
|
13
|
+
|
14
|
+
# last Char
|
15
|
+
U10:%x[0,1]
|
16
|
+
U11:%x[-1,1]
|
17
|
+
|
18
|
+
# first 1-4
|
19
|
+
U20:%x[0,2]
|
20
|
+
U21:%x[0,3]
|
21
|
+
U22:%x[0,4]
|
22
|
+
U23:%x[0,5]
|
23
|
+
|
24
|
+
# last 1-4
|
25
|
+
U30:%x[0,6]
|
26
|
+
U31:%x[0,7]
|
27
|
+
U32:%x[0,8]
|
28
|
+
U33:%x[0,9]
|
29
|
+
|
30
|
+
# Lowercased, no punct
|
31
|
+
U40:%x[-2,10]
|
32
|
+
U41:%x[-1,10]
|
33
|
+
U42:%x[0,10]
|
34
|
+
U43:%x[1,10]
|
35
|
+
U44:%x[2,10]
|
36
|
+
|
37
|
+
# Capitalization
|
38
|
+
U50:%x[0,11]
|
39
|
+
|
40
|
+
# Numbers
|
41
|
+
U60:%x[-1,12]
|
42
|
+
U61:%x[0,12]
|
43
|
+
U62:%x[1,12]
|
44
|
+
U63:%x[-1,12]/%x[0,12]
|
45
|
+
U64:%x[0,12]/%x[1,12]
|
46
|
+
|
47
|
+
# Dict info
|
48
|
+
U70:%x[0,13]
|
49
|
+
U71:%x[0,14]
|
50
|
+
U72:%x[0,15]
|
51
|
+
U73:%x[0,16]
|
52
|
+
U74:%x[0,17]
|
53
|
+
U75:%x[0,18]
|
54
|
+
|
55
|
+
# Possible editor?
|
56
|
+
U80:%x[0,19]
|
57
|
+
|
58
|
+
# Position
|
59
|
+
U90:%x[0,20]
|
60
|
+
|
61
|
+
# Punctuation
|
62
|
+
UA0:%x[0,21]
|
63
|
+
|
64
|
+
# possible chapter?
|
65
|
+
U92:%x[-1,25]/%x[0,22]/%x[0,19]
|
66
|
+
U91:%x[-1,25]/%x[0,22]/%x[1,11]
|
67
|
+
|
68
|
+
# html tag
|
69
|
+
UH2:%x[0,23]
|
70
|
+
UH3:%x[1,23]
|
71
|
+
UH4:%x[-1,23]
|
72
|
+
UH5:%x[-1,23]/%x[0,23]
|
73
|
+
UH6:%x[0,23]/%x[1,23]
|
74
|
+
UH7:%x[0,24]
|
75
|
+
|
76
|
+
# part of speech
|
77
|
+
UP0:%x[0,25]
|
78
|
+
UP1:%x[-1,25]
|
79
|
+
UP2:%x[1,25]
|
80
|
+
UP3:%x[-1,25]/%x[0,25]
|
81
|
+
UP4:%x[0,25]/%x[1,25]
|
82
|
+
|
83
|
+
# Bigram
|
84
|
+
B
|
Binary file
|
Binary file
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# See config/parscit_features.yml for function-to-number correlation
|
2
|
+
|
3
|
+
# Unigram
|
4
|
+
U00:%x[-3,0]
|
5
|
+
U01:%x[-2,0]
|
6
|
+
U02:%x[-1,0]
|
7
|
+
U03:%x[0,0]
|
8
|
+
U04:%x[1,0]
|
9
|
+
U05:%x[2,0]
|
10
|
+
U06:%x[3,0]
|
11
|
+
U07:%x[-1,0]/%x[0,0]
|
12
|
+
U08:%x[0,0]/%x[1,0]
|
13
|
+
|
14
|
+
# last Char
|
15
|
+
U10:%x[0,1]
|
16
|
+
U11:%x[-1,1]
|
17
|
+
|
18
|
+
# first 1-4
|
19
|
+
U20:%x[0,2]
|
20
|
+
U21:%x[0,3]
|
21
|
+
U22:%x[0,4]
|
22
|
+
U23:%x[0,5]
|
23
|
+
|
24
|
+
# last 1-4
|
25
|
+
U30:%x[0,6]
|
26
|
+
U31:%x[0,7]
|
27
|
+
U32:%x[0,8]
|
28
|
+
U33:%x[0,9]
|
29
|
+
|
30
|
+
# Lowercased, no punct
|
31
|
+
U40:%x[-2,10]
|
32
|
+
U41:%x[-1,10]
|
33
|
+
U42:%x[0,10]
|
34
|
+
U43:%x[1,10]
|
35
|
+
U44:%x[2,10]
|
36
|
+
|
37
|
+
# Capitalization
|
38
|
+
U50:%x[0,11]
|
39
|
+
|
40
|
+
# Numbers
|
41
|
+
U60:%x[-1,12]
|
42
|
+
U61:%x[0,12]
|
43
|
+
U62:%x[1,12]
|
44
|
+
U63:%x[-1,12]/%x[0,12]
|
45
|
+
U64:%x[0,12]/%x[1,12]
|
46
|
+
|
47
|
+
# Dict info
|
48
|
+
U70:%x[0,13]
|
49
|
+
U71:%x[0,14]
|
50
|
+
U72:%x[0,15]
|
51
|
+
U73:%x[0,16]
|
52
|
+
U74:%x[0,17]
|
53
|
+
U75:%x[0,18]
|
54
|
+
|
55
|
+
# Possible editor?
|
56
|
+
U80:%x[0,19]
|
57
|
+
|
58
|
+
# Position
|
59
|
+
U90:%x[0,20]
|
60
|
+
|
61
|
+
# Punctuation
|
62
|
+
UA0:%x[0,21]
|
63
|
+
|
64
|
+
# possible chapter?
|
65
|
+
U92:%x[-1,23]/%x[0,22]/%x[0,19]
|
66
|
+
U91:%x[-1,23]/%x[0,22]/%x[1,11]
|
67
|
+
|
68
|
+
# part of speech
|
69
|
+
UP0:%x[0,23]
|
70
|
+
UP1:%x[-1,23]
|
71
|
+
UP2:%x[1,23]
|
72
|
+
UP3:%x[-1,23]/%x[0,23]
|
73
|
+
UP4:%x[0,23]/%x[1,23]
|
74
|
+
|
75
|
+
# Bigram
|
76
|
+
B
|