opener-opinion-detector-basic 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +30 -0
  3. data/bin/opinion-detector-basic +19 -0
  4. data/bin/opinion-detector-basic-server +10 -0
  5. data/config.ru +4 -0
  6. data/core/opinion_detector_basic_multi.py +499 -0
  7. data/core/packages/KafNafParser-1.3.tar.gz +0 -0
  8. data/core/packages/VUA_pylib-1.4.tar.gz +0 -0
  9. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  10. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  11. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  12. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  13. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  14. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  15. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  16. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  17. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  18. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  19. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  20. data/core/vendor/src/crfsuite/AUTHORS +1 -0
  21. data/core/vendor/src/crfsuite/COPYING +27 -0
  22. data/core/vendor/src/crfsuite/ChangeLog +103 -0
  23. data/core/vendor/src/crfsuite/INSTALL +236 -0
  24. data/core/vendor/src/crfsuite/Makefile.am +19 -0
  25. data/core/vendor/src/crfsuite/Makefile.in +783 -0
  26. data/core/vendor/src/crfsuite/README +183 -0
  27. data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
  28. data/core/vendor/src/crfsuite/autogen.sh +38 -0
  29. data/core/vendor/src/crfsuite/compile +143 -0
  30. data/core/vendor/src/crfsuite/config.guess +1502 -0
  31. data/core/vendor/src/crfsuite/config.h.in +198 -0
  32. data/core/vendor/src/crfsuite/config.sub +1714 -0
  33. data/core/vendor/src/crfsuite/configure +14273 -0
  34. data/core/vendor/src/crfsuite/configure.in +149 -0
  35. data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
  36. data/core/vendor/src/crfsuite/depcomp +630 -0
  37. data/core/vendor/src/crfsuite/example/chunking.py +49 -0
  38. data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
  39. data/core/vendor/src/crfsuite/example/ner.py +270 -0
  40. data/core/vendor/src/crfsuite/example/pos.py +78 -0
  41. data/core/vendor/src/crfsuite/example/template.py +88 -0
  42. data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
  43. data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
  44. data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
  45. data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
  46. data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
  47. data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
  48. data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
  49. data/core/vendor/src/crfsuite/frontend/main.c +137 -0
  50. data/core/vendor/src/crfsuite/frontend/option.c +93 -0
  51. data/core/vendor/src/crfsuite/frontend/option.h +86 -0
  52. data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
  53. data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
  54. data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
  55. data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
  56. data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
  57. data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
  58. data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
  59. data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
  60. data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
  61. data/core/vendor/src/crfsuite/include/os.h +61 -0
  62. data/core/vendor/src/crfsuite/install-sh +520 -0
  63. data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
  64. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
  65. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
  66. data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
  67. data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
  68. data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
  69. data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
  70. data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
  71. data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
  72. data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
  73. data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
  74. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
  75. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
  76. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
  77. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
  78. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
  79. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
  80. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
  81. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
  82. data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
  83. data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
  84. data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
  85. data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
  86. data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
  87. data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
  88. data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
  89. data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
  90. data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
  91. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
  92. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
  93. data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
  94. data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
  95. data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
  96. data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
  97. data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
  98. data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
  99. data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
  100. data/core/vendor/src/crfsuite/missing +376 -0
  101. data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
  102. data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
  103. data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
  104. data/core/vendor/src/crfsuite/swig/export.i +32 -0
  105. data/core/vendor/src/crfsuite/swig/python/README +92 -0
  106. data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
  107. data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
  108. data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
  109. data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
  110. data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
  111. data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
  112. data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
  113. data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
  114. data/core/vendor/src/liblbfgs/AUTHORS +1 -0
  115. data/core/vendor/src/liblbfgs/COPYING +22 -0
  116. data/core/vendor/src/liblbfgs/ChangeLog +120 -0
  117. data/core/vendor/src/liblbfgs/INSTALL +231 -0
  118. data/core/vendor/src/liblbfgs/Makefile.am +10 -0
  119. data/core/vendor/src/liblbfgs/Makefile.in +638 -0
  120. data/core/vendor/src/liblbfgs/NEWS +0 -0
  121. data/core/vendor/src/liblbfgs/README +71 -0
  122. data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
  123. data/core/vendor/src/liblbfgs/autogen.sh +38 -0
  124. data/core/vendor/src/liblbfgs/config.guess +1411 -0
  125. data/core/vendor/src/liblbfgs/config.h.in +64 -0
  126. data/core/vendor/src/liblbfgs/config.sub +1500 -0
  127. data/core/vendor/src/liblbfgs/configure +21146 -0
  128. data/core/vendor/src/liblbfgs/configure.in +107 -0
  129. data/core/vendor/src/liblbfgs/depcomp +522 -0
  130. data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
  131. data/core/vendor/src/liblbfgs/install-sh +322 -0
  132. data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
  133. data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
  134. data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
  135. data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
  136. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
  137. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
  138. data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
  139. data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
  140. data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
  141. data/core/vendor/src/liblbfgs/missing +353 -0
  142. data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
  143. data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
  144. data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
  145. data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
  146. data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
  147. data/core/vendor/src/svm_light/LICENSE.txt +59 -0
  148. data/core/vendor/src/svm_light/Makefile +105 -0
  149. data/core/vendor/src/svm_light/kernel.h +40 -0
  150. data/core/vendor/src/svm_light/svm_classify.c +197 -0
  151. data/core/vendor/src/svm_light/svm_common.c +985 -0
  152. data/core/vendor/src/svm_light/svm_common.h +301 -0
  153. data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
  154. data/core/vendor/src/svm_light/svm_learn.c +4147 -0
  155. data/core/vendor/src/svm_light/svm_learn.h +169 -0
  156. data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
  157. data/core/vendor/src/svm_light/svm_loqo.c +211 -0
  158. data/ext/hack/Rakefile +17 -0
  159. data/ext/hack/support.rb +88 -0
  160. data/lib/opener/opinion_detector_basic.rb +91 -0
  161. data/lib/opener/opinion_detector_basic/public/markdown.css +284 -0
  162. data/lib/opener/opinion_detector_basic/server.rb +16 -0
  163. data/lib/opener/opinion_detector_basic/version.rb +5 -0
  164. data/lib/opener/opinion_detector_basic/views/index.erb +97 -0
  165. data/lib/opener/opinion_detector_basic/views/result.erb +15 -0
  166. data/opener-opinion-detector-basic.gemspec +36 -0
  167. data/pre_build_requirements.txt +1 -0
  168. metadata +309 -0
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env python
2
+
3
+ """
4
+ A feature extractor for chunking.
5
+ Copyright 2010,2011 Naoaki Okazaki.
6
+ """
7
+
8
# Character that delimits the columns of each input line.
separator = ' '

# Column names of the input data, in the order they appear on a line.
fields = 'w pos y'

# Attribute templates.  Each template is a tuple of (field, offset) pairs;
# the table is generated from offset ranges, in this order: word unigrams,
# word bigrams, POS unigrams, POS bigrams and POS trigrams.
templates = (
    tuple((('w', i),) for i in range(-2, 3))
    + tuple((('w', i), ('w', i + 1)) for i in (-1, 0))
    + tuple((('pos', i),) for i in range(-2, 3))
    + tuple((('pos', i), ('pos', i + 1)) for i in range(-2, 2))
    + tuple((('pos', i), ('pos', i + 1), ('pos', i + 2)) for i in range(-2, 1))
)
36
+
37
+
38
+ import crfutils
39
+
40
def feature_extractor(X):
    """
    Populate the 'F' field of every item in the sequence X.

    The module-level attribute templates are applied first; a non-empty
    sequence then gets a '__BOS__' marker on its first item and an
    '__EOS__' marker on its last item.
    """
    crfutils.apply_templates(X, templates)
    if not X:
        # Nothing to mark on an empty sequence.
        return
    X[0]['F'].append('__BOS__')
    X[-1]['F'].append('__EOS__')
47
+
48
if __name__ == '__main__':
    # Executed as a script: hand this module's extractor and input layout
    # to the shared command-line driver.
    crfutils.main(feature_extractor, sep=separator, fields=fields)
@@ -0,0 +1,179 @@
1
+ """
2
+ A miscellaneous utility for sequential labeling.
3
+ Copyright 2010,2011 Naoaki Okazaki.
4
+ """
5
+
6
+ import optparse
7
+ import sys
8
+
9
def apply_templates(X, templates):
    """
    Generate features for an item sequence by applying feature templates.
    Each template is a tuple of (name, offset) pairs: name selects a field
    and offset selects an item relative to the current position. When every
    referenced position lies inside the sequence, the feature
    'name[offset]|...=value|...' is appended to the 'F' field of the
    current item; otherwise the template is skipped for that position.

    @type   X:          list of mapping objects
    @param  X:          The item sequence.
    @type   templates:  sequence of tuples of (str, int)
    @param  templates:  The feature templates.
    """
    length = len(X)
    for template in templates:
        label = '|'.join(
            ['%s[%d]' % (field, offset) for field, offset in template])
        for t, item in enumerate(X):
            values = []
            for field, offset in template:
                p = t + offset
                if p < 0 or p >= length:
                    # The template reaches outside the sequence here.
                    break
                values.append(X[p][field])
            else:
                # Reached only when no position was out of range.
                if values:
                    item['F'].append('%s=%s' % (label, '|'.join(values)))
34
+
35
def readiter(fi, names, sep=' '):
    """
    Return an iterator for item sequences read from a file object.
    This function reads sequences from a file object L{fi}, and yields
    each sequence as a list of mapping objects. Each line (item) from the
    file object is split by the separator character L{sep}. Separated
    values of the item are named by L{names}, and stored in a mapping
    object. Every item has a field 'F' that is reserved for storing
    features.

    A blank line terminates the current sequence (which may be empty when
    blank lines are consecutive). A sequence still pending at the end of
    the input is yielded as well, so data that does not end with a blank
    line is not lost.

    @type   fi:     file
    @param  fi:     The file object (any iterable of lines).
    @type   names:  tuple
    @param  names:  The list of field names.
    @type   sep:    str
    @param  sep:    The separator character.
    @rtype          iterator of lists of mapping objects
    @return         An iterator for sequences.
    @raise ValueError: If a line has fewer values than field names.
    """
    X = []
    for line in fi:
        line = line.strip('\n')
        if not line:
            # A blank line marks the end of the current sequence.
            yield X
            X = []
            continue

        values = line.split(sep)
        if len(values) < len(names):
            raise ValueError(
                'Too few fields (%d) for %r\n%s' % (len(values), names, line))
        item = {'F': []}    # 'F' is reserved for features.
        # Extra trailing values (beyond len(names)) are ignored.
        item.update(zip(names, values))
        X.append(item)

    # Flush the last sequence when the input lacks a trailing blank line.
    if X:
        yield X
69
+
70
def escape(src):
    """
    Replace every colon in a feature name with the token '__COLON__'.

    @type   src:    str
    @param  src:    A feature name
    @rtype          str
    @return         The feature name with all colons replaced.
    """
    pieces = src.split(':')
    return '__COLON__'.join(pieces)
80
+
81
def output_features(fo, X, field=''):
    """
    Write a sequence to L{fo} in CRFSuite format: one line per item,
    holding the reference label (only when L{field} is a non-empty
    string) followed by tab-prefixed features, and an empty line after
    the last item. A feature is either a string (written escaped) or a
    (name, value) pair (written as 'name:value' with the value formatted
    by '%f').

    @type   fo:     file
    @param  fo:     The file object.
    @type   X:      list of mapping objects
    @param  X:      The sequence.
    @type   field:  str
    @param  field:  The field name of reference labels.
    """
    write = fo.write
    for item in X:
        if field:
            write('%s' % item[field])
        for attr in item['F']:
            if not isinstance(attr, str):
                # Weighted attribute: (name, value).
                write('\t%s:%f' % (escape(attr[0]), attr[1]))
            else:
                write('\t%s' % escape(attr))
        write('\n')
    # An empty line terminates the sequence.
    write('\n')
104
+
105
def to_crfsuite(X):
    """
    Build a crfsuite.ItemSequence from an item sequence. Every entry of
    an item's 'F' field becomes a crfsuite.Attribute: string features are
    passed by (escaped) name only, other features are treated as
    (name, value) pairs.

    @type   X:      list of mapping objects
    @param  X:      The sequence.
    @rtype          crfsuite.ItemSequence
    @return         The same sequence in crfsuite.ItemSequence type.
    """
    # Imported lazily so the module stays usable without crfsuite installed.
    import crfsuite

    sequence = crfsuite.ItemSequence()
    for entry in X:
        item = crfsuite.Item()
        for feat in entry['F']:
            if isinstance(feat, str):
                attribute = crfsuite.Attribute(escape(feat))
            else:
                attribute = crfsuite.Attribute(escape(feat[0]), feat[1])
            item.append(attribute)
        sequence.append(item)
    return sequence
126
+
127
def main(feature_extractor, fields='w pos y', sep=' '):
    """
    Command-line driver shared by the feature-extractor scripts.

    Reads sequences from STDIN. Without the -t option, the attributes
    produced by L{feature_extractor} are written to STDOUT in CRFSuite
    format, using the 'y' field as the reference label. With -t MODEL,
    each sequence is tagged with the model and the input fields plus the
    predicted label are written to STDOUT, tab-separated.

    @type   feature_extractor:  callable
    @param  feature_extractor:  Function that fills the 'F' field of every
                                item of a sequence in place.
    @type   fields:             str
    @param  fields:             Default space-separated field names (-f).
    @type   sep:                str
    @param  sep:                Default column separator (-s).
    """
    fi = sys.stdin
    fo = sys.stdout

    # Describe and parse the command-line arguments.
    parser = optparse.OptionParser(usage="""usage: %prog [options]
This utility reads a data set from STDIN, and outputs attributes to STDOUT.
Each line of a data set must consist of field values separated by SEPARATOR
characters. The names and order of field values can be specified by -f option.
The separator character can be specified with -s option. Instead of outputting
attributes, this utility tags the input data when a model file is specified by
-t option (CRFsuite Python module must be installed)."""
        )
    parser.add_option(
        '-t', dest='model',
        help='tag the input using the model (requires "crfsuite" module)'
        )
    parser.add_option(
        '-f', dest='fields', default=fields,
        help='specify field names of input data [default: "%default"]'
        )
    parser.add_option(
        '-s', dest='separator', default=sep,
        help='specify the separator of columns of input data [default: "%default"]'
        )
    options, _ = parser.parse_args()

    # The field names of the input: ['w', 'pos', 'y'] by default.
    names = options.fields.split(' ')

    # readiter() is a generator; nothing is read until iteration starts.
    sequences = readiter(fi, names, options.separator)

    if not options.model:
        # Feature-extraction mode: emit attributes for each sequence.
        for X in sequences:
            feature_extractor(X)
            output_features(fo, X, 'y')
        return

    # Tagging mode: create a tagger with an existing model.
    import crfsuite
    tagger = crfsuite.Tagger()
    tagger.open(options.model)

    for X in sequences:
        feature_extractor(X)
        yseq = tagger.tag(to_crfsuite(X))
        for t, item in enumerate(X):
            fo.write('\t'.join([item[f] for f in names]))
            fo.write('\t%s\n' % yseq[t])
        fo.write('\n')
@@ -0,0 +1,270 @@
1
+ #!/usr/bin/env python
2
+
3
+ """
4
+ A feature extractor for named entity recognition (NER).
5
+ Copyright 2010,2011 Naoaki Okazaki.
6
+ """
7
+
8
# Character that delimits the columns of each input line.
separator = ' '

# Column names of the input data, in the order they appear on a line.
fields = 'y w pos chk'
13
+
14
+
15
+ import crfutils
16
+
17
def get_shape(token):
    """
    Map each character of the token to a shape code and return the codes
    joined into a string: 'U' uppercase, 'L' lowercase, 'D' digit, and one
    representative character per punctuation group. Characters outside
    every class are kept unchanged.
    """
    # Representative code for each punctuation group.
    codes = {}
    for members, code in (
            ('.,', '.'),
            (';:?!', ';'),
            ('+-*/=|_', '-'),
            ('({[<', '('),
            (')}]>', ')'),
            ):
        for member in members:
            codes[member] = code

    shape = []
    for ch in token:
        if ch.isupper():
            shape.append('U')
        elif ch.islower():
            shape.append('L')
        elif ch.isdigit():
            shape.append('D')
        else:
            shape.append(codes.get(ch, ch))
    return ''.join(shape)
39
+
40
def degenerate(src):
    """
    Collapse every run of identical consecutive characters in src into a
    single character (e.g. 'UULLL' becomes 'UL').
    """
    kept = []
    for ch in src:
        if kept and kept[-1] == ch:
            # Same as the previous character: part of the same run.
            continue
        kept.append(ch)
    return ''.join(kept)
46
+
47
def get_type(token):
    """
    Classify the token by its character composition.

    Every character rules out the categories it is incompatible with;
    the result is the first category, in the priority order of T, that
    no character ruled out. Returns 'EMPTY' for an empty token and 'NO'
    when every category was ruled out.
    """
    T = (
        'AllUpper', 'AllDigit', 'AllSymbol',
        'AllUpperDigit', 'AllUpperSymbol', 'AllDigitSymbol',
        'AllUpperDigitSymbol',
        'InitUpper',
        'AllLetter',
        'AllAlnum',
        )
    if not token:
        return 'EMPTY'

    # Categories eliminated by one character of each kind.
    ruled_out_by = {
        'upper': ('AllDigit', 'AllSymbol', 'AllDigitSymbol'),
        'digit': ('AllUpper', 'AllSymbol', 'AllUpperSymbol', 'AllLetter'),
        'lower': (
            'AllUpper', 'AllDigit', 'AllSymbol',
            'AllUpperDigit', 'AllUpperSymbol', 'AllDigitSymbol',
            'AllUpperDigitSymbol',
            ),
        'other': (
            'AllUpper', 'AllDigit', 'AllUpperDigit',
            'AllLetter', 'AllAlnum',
            ),
        }

    excluded = set()
    for index, ch in enumerate(token):
        if ch.isupper():
            kind = 'upper'
        elif ch.isdigit() or ch in (',', '.'):
            # ',' and '.' are counted with the digits (e.g. '1,000.5').
            kind = 'digit'
        elif ch.islower():
            kind = 'lower'
        else:
            kind = 'other'
        excluded.update(ruled_out_by[kind])

        if index == 0 and kind != 'upper':
            excluded.add('InitUpper')

    for tag in T:
        if tag not in excluded:
            return tag
    return 'NO'
93
+
94
def get_2d(token):
    """Return True when the token consists of exactly two digits."""
    if len(token) != 2:
        return False
    return token.isdigit()
96
+
97
def get_4d(token):
    """Return True when the token consists of exactly four digits."""
    if len(token) != 4:
        return False
    return token.isdigit()
99
+
100
def get_da(token):
    """
    Return True when the token is made only of digits and letters and
    contains at least one of each.
    """
    seen = set()
    for ch in token:
        if ch.isdigit():
            seen.add('digit')
        elif ch.isalpha():
            seen.add('alpha')
        else:
            # Any other character disqualifies the token.
            return False
    return len(seen) == 2
111
+
112
def get_dand(token, p):
    """
    Return True when the token is made only of digits and the character
    p, and contains at least one digit and at least one p.
    """
    has_digit = False
    has_mark = False
    for ch in token:
        if ch.isdigit():
            has_digit = True
            continue
        if ch != p:
            # Neither a digit nor the allowed character.
            return False
        has_mark = True
    return has_digit and has_mark
123
+
124
def get_all_other(token):
    """
    Return True when the token has no alphanumeric character (an empty
    token therefore yields True).
    """
    return not any(ch.isalnum() for ch in token)
129
+
130
def get_capperiod(token):
    """
    Return True when the token is an uppercase letter followed by a
    period (e.g. 'J.').
    """
    if len(token) != 2:
        return False
    first, second = token
    return second == '.' and first.isupper()
132
+
133
def contains_upper(token):
    """Return True when at least one character of the token is uppercase."""
    return any(ch.isupper() for ch in token)
138
+
139
def contains_lower(token):
    """Return True when at least one character of the token is lowercase."""
    return any(ch.islower() for ch in token)
144
+
145
def contains_alpha(token):
    """Return True when at least one character of the token is a letter."""
    return any(ch.isalpha() for ch in token)
150
+
151
def contains_digit(token):
    """Return True when at least one character of the token is a digit."""
    return any(ch.isdigit() for ch in token)
156
+
157
def contains_symbol(token):
    """
    Return True when at least one character of the token is not
    alphanumeric, and False otherwise (including for an empty token).

    The test uses logical negation. Bitwise inversion of a bool (``~``)
    yields -2 or -1, which are both truthy, so it would report a symbol
    for every non-empty token.
    """
    return any(not ch.isalnum() for ch in token)
162
+
163
def b(v):
    """Render a truth value as the feature string 'yes' or 'no'."""
    if v:
        return 'yes'
    return 'no'
165
+
166
def observation(v, defval=''):
    """
    Derive observation fields from the token v['w'] and store them in
    the item v, in place.

    Adds the lowercased token, its shape fields and type, prefixes and
    suffixes of length one to four (defval when the token is shorter),
    and a series of 'yes'/'no' flags describing the characters of the
    token.
    """
    word = v['w']
    size = len(word)

    # Lowercased token, shape, collapsed shape and type.
    v['wl'] = word.lower()
    shape = get_shape(word)
    v['shape'] = shape
    v['shaped'] = degenerate(shape)
    v['type'] = get_type(word)

    # Prefixes p1..p4, then suffixes s1..s4.
    for n in range(1, 5):
        v['p%d' % n] = word[:n] if size >= n else defval
    for n in range(1, 5):
        v['s%d' % n] = word[-n:] if size >= n else defval

    # Digit patterns: two digits, four digits, digits mixed with letters.
    v['2d'] = b(get_2d(word))
    v['4d'] = b(get_4d(word))
    v['d&a'] = b(get_da(word))
    # Digits combined with one punctuation mark: d&-, d&/, d&, and d&.
    for mark in ('-', '/', ',', '.'):
        v['d&' + mark] = b(get_dand(word, mark))
    # An uppercase letter followed by '.'.
    v['up'] = b(get_capperiod(word))

    # Initial uppercase; all uppercase / lowercase / digits / other.
    v['iu'] = b(word and word[0].isupper())
    v['au'] = b(word.isupper())
    v['al'] = b(word.islower())
    v['ad'] = b(word.isdigit())
    v['ao'] = b(get_all_other(word))

    # Contains an uppercase letter / lowercase letter / letter / digit /
    # symbol.
    for key, probe in (
            ('cu', contains_upper),
            ('cl', contains_lower),
            ('ca', contains_alpha),
            ('cd', contains_digit),
            ('cs', contains_symbol),
            ):
        v[key] = b(probe(word))
226
+
227
def disjunctive(X, t, field, begin, end):
    """
    Append disjunctive features to item t of the sequence X.

    For each offset in begin..end (inclusive) whose position t + offset
    lies inside the sequence, the feature 'field[begin..end]=value' is
    appended to X[t]['F'], where value is that neighbour's field value.
    The offset itself is not part of the feature name.
    """
    label = '%s[%d..%d]' % (field, begin, end)
    target = X[t]['F']
    size = len(X)
    for offset in range(begin, end + 1):
        p = t + offset
        if 0 <= p < size:
            target.append('%s=%s' % (label, X[p][field]))
234
+
235
# Fields used for unigram templates (offsets -2..2).
U = [
    'w', 'wl', 'pos', 'chk', 'shape', 'shaped', 'type',
    'p1', 'p2', 'p3', 'p4',
    's1', 's2', 's3', 's4',
    '2d', '4d', 'd&a', 'd&-', 'd&/', 'd&,', 'd&.', 'up',
    'iu', 'au', 'al', 'ad', 'ao',
    'cu', 'cl', 'ca', 'cd', 'cs',
    ]
# Fields used for bigram templates over adjacent offsets (-2,-1)..(1,2).
B = ['w', 'pos', 'chk', 'shaped', 'type']

# Attribute templates: all unigram templates first, then all bigram ones.
templates = []
for name in U:
    templates.extend(((name, offset),) for offset in range(-2, 3))
for name in B:
    templates.extend(
        ((name, offset), (name, offset + 1)) for offset in range(-2, 2))
250
+
251
def feature_extractor(X):
    """
    Populate the 'F' field of every item in the sequence X.

    Steps, in order: derive the observation fields of each item, apply
    the module-level attribute templates, append disjunctive word
    features for the four preceding and the four following positions,
    and mark a non-empty sequence with '__BOS__' and '__EOS__'.
    """
    # Observation fields must exist before the templates read them.
    for item in X:
        observation(item)

    crfutils.apply_templates(X, templates)

    # Disjunctive word features for left and right context windows.
    for position in range(len(X)):
        disjunctive(X, position, 'w', -4, -1)
        disjunctive(X, position, 'w', 1, 4)

    if not X:
        # Nothing to mark on an empty sequence.
        return
    X[0]['F'].append('__BOS__')
    X[-1]['F'].append('__EOS__')
268
+
269
if __name__ == '__main__':
    # Executed as a script: hand this module's extractor and input layout
    # to the shared command-line driver.
    crfutils.main(feature_extractor, sep=separator, fields=fields)