opener-opinion-detector-basic 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +30 -0
- data/bin/opinion-detector-basic +19 -0
- data/bin/opinion-detector-basic-server +10 -0
- data/config.ru +4 -0
- data/core/opinion_detector_basic_multi.py +499 -0
- data/core/packages/KafNafParser-1.3.tar.gz +0 -0
- data/core/packages/VUA_pylib-1.4.tar.gz +0 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
- data/core/vendor/src/crfsuite/AUTHORS +1 -0
- data/core/vendor/src/crfsuite/COPYING +27 -0
- data/core/vendor/src/crfsuite/ChangeLog +103 -0
- data/core/vendor/src/crfsuite/INSTALL +236 -0
- data/core/vendor/src/crfsuite/Makefile.am +19 -0
- data/core/vendor/src/crfsuite/Makefile.in +783 -0
- data/core/vendor/src/crfsuite/README +183 -0
- data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
- data/core/vendor/src/crfsuite/autogen.sh +38 -0
- data/core/vendor/src/crfsuite/compile +143 -0
- data/core/vendor/src/crfsuite/config.guess +1502 -0
- data/core/vendor/src/crfsuite/config.h.in +198 -0
- data/core/vendor/src/crfsuite/config.sub +1714 -0
- data/core/vendor/src/crfsuite/configure +14273 -0
- data/core/vendor/src/crfsuite/configure.in +149 -0
- data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
- data/core/vendor/src/crfsuite/depcomp +630 -0
- data/core/vendor/src/crfsuite/example/chunking.py +49 -0
- data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
- data/core/vendor/src/crfsuite/example/ner.py +270 -0
- data/core/vendor/src/crfsuite/example/pos.py +78 -0
- data/core/vendor/src/crfsuite/example/template.py +88 -0
- data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
- data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
- data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
- data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
- data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
- data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
- data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
- data/core/vendor/src/crfsuite/frontend/main.c +137 -0
- data/core/vendor/src/crfsuite/frontend/option.c +93 -0
- data/core/vendor/src/crfsuite/frontend/option.h +86 -0
- data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
- data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
- data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
- data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
- data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
- data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
- data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
- data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
- data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
- data/core/vendor/src/crfsuite/include/os.h +61 -0
- data/core/vendor/src/crfsuite/install-sh +520 -0
- data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
- data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
- data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
- data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
- data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
- data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
- data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
- data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
- data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
- data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
- data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
- data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
- data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
- data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
- data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
- data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
- data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
- data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
- data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
- data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
- data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
- data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
- data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
- data/core/vendor/src/crfsuite/missing +376 -0
- data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
- data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
- data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
- data/core/vendor/src/crfsuite/swig/export.i +32 -0
- data/core/vendor/src/crfsuite/swig/python/README +92 -0
- data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
- data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
- data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
- data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
- data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
- data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
- data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
- data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
- data/core/vendor/src/liblbfgs/AUTHORS +1 -0
- data/core/vendor/src/liblbfgs/COPYING +22 -0
- data/core/vendor/src/liblbfgs/ChangeLog +120 -0
- data/core/vendor/src/liblbfgs/INSTALL +231 -0
- data/core/vendor/src/liblbfgs/Makefile.am +10 -0
- data/core/vendor/src/liblbfgs/Makefile.in +638 -0
- data/core/vendor/src/liblbfgs/NEWS +0 -0
- data/core/vendor/src/liblbfgs/README +71 -0
- data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
- data/core/vendor/src/liblbfgs/autogen.sh +38 -0
- data/core/vendor/src/liblbfgs/config.guess +1411 -0
- data/core/vendor/src/liblbfgs/config.h.in +64 -0
- data/core/vendor/src/liblbfgs/config.sub +1500 -0
- data/core/vendor/src/liblbfgs/configure +21146 -0
- data/core/vendor/src/liblbfgs/configure.in +107 -0
- data/core/vendor/src/liblbfgs/depcomp +522 -0
- data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
- data/core/vendor/src/liblbfgs/install-sh +322 -0
- data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
- data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
- data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
- data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
- data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
- data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
- data/core/vendor/src/liblbfgs/missing +353 -0
- data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
- data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
- data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
- data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
- data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
- data/core/vendor/src/svm_light/LICENSE.txt +59 -0
- data/core/vendor/src/svm_light/Makefile +105 -0
- data/core/vendor/src/svm_light/kernel.h +40 -0
- data/core/vendor/src/svm_light/svm_classify.c +197 -0
- data/core/vendor/src/svm_light/svm_common.c +985 -0
- data/core/vendor/src/svm_light/svm_common.h +301 -0
- data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
- data/core/vendor/src/svm_light/svm_learn.c +4147 -0
- data/core/vendor/src/svm_light/svm_learn.h +169 -0
- data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
- data/core/vendor/src/svm_light/svm_loqo.c +211 -0
- data/ext/hack/Rakefile +17 -0
- data/ext/hack/support.rb +88 -0
- data/lib/opener/opinion_detector_basic.rb +91 -0
- data/lib/opener/opinion_detector_basic/public/markdown.css +284 -0
- data/lib/opener/opinion_detector_basic/server.rb +16 -0
- data/lib/opener/opinion_detector_basic/version.rb +5 -0
- data/lib/opener/opinion_detector_basic/views/index.erb +97 -0
- data/lib/opener/opinion_detector_basic/views/result.erb +15 -0
- data/opener-opinion-detector-basic.gemspec +36 -0
- data/pre_build_requirements.txt +1 -0
- metadata +309 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
A feature extractor for chunking.
|
|
5
|
+
Copyright 2010,2011 Naoaki Okazaki.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# Character that separates the field values on each input line.
separator = ' '

# Names of the fields found on each input line, in order.
fields = 'w pos y'

# Attribute templates. Each template is a tuple of (field, offset) pairs;
# the table is assembled from offset windows instead of being spelled out:
#   - word unigrams at offsets -2..2 and word bigrams around the current item
#   - part-of-speech unigrams, bigrams and trigrams within offsets -2..2
templates = (
    tuple((('w', o),) for o in range(-2, 3))
    + tuple((('w', o), ('w', o + 1)) for o in range(-1, 1))
    + tuple((('pos', o),) for o in range(-2, 3))
    + tuple((('pos', o), ('pos', o + 1)) for o in range(-2, 2))
    + tuple((('pos', o), ('pos', o + 1), ('pos', o + 2)) for o in range(-2, 1))
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
import crfutils
|
|
39
|
+
|
|
40
|
+
def feature_extractor(X):
    """
    Fill the 'F' field of every item in a sequence with chunking attributes.

    @type   X:      list of mapping objects
    @param  X:      The item sequence; it is modified in place.
    """
    # Template-driven attributes come first.
    crfutils.apply_templates(X, templates)

    # Nothing to mark on an empty sequence.
    if not X:
        return

    # Mark the sequence boundaries by hand.
    X[0]['F'].append('__BOS__')     # beginning of sequence
    X[-1]['F'].append('__EOS__')    # end of sequence


if __name__ == '__main__':
    crfutils.main(feature_extractor, fields=fields, sep=separator)
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""
|
|
2
|
+
A miscellaneous utility for sequential labeling.
|
|
3
|
+
Copyright 2010,2011 Naoaki Okazaki.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import optparse
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
def apply_templates(X, templates):
    """
    Generate features for an item sequence by applying feature templates.
    A feature template consists of a tuple of (name, offset) pairs,
    where name and offset specify a field name and offset from which
    the template extracts a feature value. Generated features are stored
    in the 'F' field of each item in the sequence. A template is skipped
    for an item when any of its offsets falls outside the sequence.

    @type   X:          list of mapping objects
    @param  X:          The item sequence.
    @type   templates:  sequence of tuples of (str, int)
    @param  templates:  The feature templates.
    """
    # The sequence length is invariant here; a plain bounds comparison
    # avoids building range(len(X)) for every membership check (which is a
    # full list on Python 2).
    length = len(X)
    for template in templates:
        name = '|'.join(['%s[%d]' % (f, o) for f, o in template])
        for t in range(length):
            values = []
            for field, offset in template:
                p = t + offset
                if not 0 <= p < length:
                    # Out of range: drop the whole template for this item.
                    values = []
                    break
                values.append(X[p][field])
            if values:
                X[t]['F'].append('%s=%s' % (name, '|'.join(values)))
|
|
34
|
+
|
|
35
|
+
def readiter(fi, names, sep=' '):
    """
    Return an iterator for item sequences read from a file object.
    This function reads a sequence from a file object L{fi}, and
    yields the sequence as a list of mapping objects. Each line
    (item) from the file object is split by the separator character
    L{sep}. Separated values of the item are named by L{names},
    and stored in a mapping object. Every item has a field 'F' that
    is reserved for storing features. An empty line terminates the
    current sequence; a final sequence that is not followed by an empty
    line is yielded as well when the input is exhausted.

    @type   fi:     file
    @param  fi:     The file object (any iterable of lines).
    @type   names:  tuple
    @param  names:  The list of field names.
    @type   sep:    str
    @param  sep:    The separator character.
    @rtype          list of mapping objects
    @return         An iterator for sequences.
    @raise  ValueError: A line has fewer values than L{names}.
    """
    X = []
    for line in fi:
        line = line.strip('\n')
        if not line:
            # A blank line closes the current sequence.
            yield X
            X = []
            continue

        values = line.split(sep)
        if len(values) < len(names):
            raise ValueError(
                'Too few fields (%d) for %r\n%s' % (len(values), names, line))
        item = {'F': []}    # 'F' is reserved for features.
        # zip() stops at the shorter input, so surplus values are ignored.
        item.update(zip(names, values))
        X.append(item)

    # Do not lose the last sequence when the input lacks a trailing blank line.
    if X:
        yield X
|
|
69
|
+
|
|
70
|
+
def escape(src):
    """
    Replace every colon in a feature name with the token '__COLON__'.

    @type   src:    str
    @param  src:    A feature name
    @rtype          str
    @return         The feature name with colons escaped.
    """
    pieces = src.split(':')
    return '__COLON__'.join(pieces)
|
|
80
|
+
|
|
81
|
+
def output_features(fo, X, field=''):
    """
    Write a sequence to a file object in CRFSuite format: one line per
    item, holding the reference label (only when L{field} is a non-empty
    string) followed by tab-prefixed features, then an empty line that
    ends the sequence.

    @type   fo:     file
    @param  fo:     The file object.
    @type   X:      list of mapping objects
    @param  X:      The sequence.
    @type   field:  str
    @param  field:  The field name of reference labels.
    """
    for item in X:
        pieces = []
        if field:
            pieces.append('%s' % item[field])
        for attr in item['F']:
            if isinstance(attr, str):
                # A bare attribute name.
                pieces.append('\t%s' % escape(attr))
            else:
                # A (name, value) pair.
                pieces.append('\t%s:%f' % (escape(attr[0]), attr[1]))
        pieces.append('\n')
        fo.write(''.join(pieces))
    fo.write('\n')
|
|
104
|
+
|
|
105
|
+
def to_crfsuite(X):
    """
    Build a crfsuite.ItemSequence from the 'F' features of an item
    sequence, for use with the crfsuite Python module.

    @type   X:      list of mapping objects
    @param  X:      The sequence.
    @rtype          crfsuite.ItemSequence
    @return         The same sequence in crfsuite.ItemSequence type.
    """
    # Imported lazily so the module stays usable without crfsuite installed.
    import crfsuite

    sequence = crfsuite.ItemSequence()
    for entry in X:
        converted = crfsuite.Item()
        for feature in entry['F']:
            if not isinstance(feature, str):
                # A (name, value) pair.
                attribute = crfsuite.Attribute(escape(feature[0]), feature[1])
            else:
                # A bare attribute name.
                attribute = crfsuite.Attribute(escape(feature))
            converted.append(attribute)
        sequence.append(converted)
    return sequence
|
|
126
|
+
|
|
127
|
+
def main(feature_extractor, fields='w pos y', sep=' '):
    """
    Command-line driver: read sequences from STDIN, run the feature
    extractor on each, and write the result to STDOUT. Without -t the
    attributes are written in CRFSuite format; with -t the input is
    tagged using the given model.

    @type   feature_extractor:  callable
    @param  feature_extractor:  Called with each item sequence.
    @type   fields:             str
    @param  fields:             Default space-separated field names.
    @type   sep:                str
    @param  sep:                Default separator of input columns.
    """
    fi = sys.stdin
    fo = sys.stdout

    # Parse the command-line arguments.
    parser = optparse.OptionParser(usage="""usage: %prog [options]
This utility reads a data set from STDIN, and outputs attributes to STDOUT.
Each line of a data set must consist of field values separated by SEPARATOR
characters. The names and order of field values can be specified by -f option.
The separator character can be specified with -s option. Instead of outputting
attributes, this utility tags the input data when a model file is specified by
-t option (CRFsuite Python module must be installed)."""
        )
    parser.add_option(
        '-t', dest='model',
        help='tag the input using the model (requires "crfsuite" module)'
        )
    parser.add_option(
        '-f', dest='fields', default=fields,
        help='specify field names of input data [default: "%default"]'
        )
    parser.add_option(
        '-s', dest='separator', default=sep,
        help='specify the separator of columns of input data [default: "%default"]'
        )
    (options, args) = parser.parse_args()

    # The fields of input: ('w', 'pos', 'y') by default.
    names = options.fields.split(' ')

    if not options.model:
        # No model given: emit the attributes of every sequence that the
        # generator function readiter() reads from STDIN.
        for X in readiter(fi, names, options.separator):
            feature_extractor(X)
            output_features(fo, X, 'y')
        return

    # Create a tagger with an existing model.
    import crfsuite
    tagger = crfsuite.Tagger()
    tagger.open(options.model)

    # For each sequence from STDIN.
    for X in readiter(fi, names, options.separator):
        # Obtain features and the predicted labels.
        feature_extractor(X)
        yseq = tagger.tag(to_crfsuite(X))
        for t, item in enumerate(X):
            # Input fields followed by the predicted label.
            fo.write('\t'.join([item[f] for f in names]))
            fo.write('\t%s\n' % yseq[t])
        fo.write('\n')
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
A feature extractor for named entity recognition (NER).
|
|
5
|
+
Copyright 2010,2011 Naoaki Okazaki.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# Character that separates the field values on each input line.
separator = ' '

# Names of the fields found on each input line, in order.
fields = 'y w pos chk'
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
import crfutils
|
|
16
|
+
|
|
17
|
+
def get_shape(token):
    """
    Map each character of a token to a shape class: 'U' (uppercase),
    'L' (lowercase), 'D' (digit), one representative per punctuation
    group, or the character itself when it belongs to no class.
    """
    # Representative character for each punctuation group.
    symbol_class = {}
    groups = (
        ('.,', '.'),
        (';:?!', ';'),
        ('+-*/=|_', '-'),
        ('({[<', '('),
        (')}]>', ')'),
    )
    for members, representative in groups:
        for member in members:
            symbol_class[member] = representative

    shape = []
    for c in token:
        if c.isupper():
            shape.append('U')
        elif c.islower():
            shape.append('L')
        elif c.isdigit():
            shape.append('D')
        else:
            # Unclassified characters stand for themselves.
            shape.append(symbol_class.get(c, c))
    return ''.join(shape)
|
|
39
|
+
|
|
40
|
+
def degenerate(src):
    """Collapse every run of identical consecutive characters into one."""
    kept = []
    for c in src:
        if kept and kept[-1] == c:
            continue
        kept.append(c)
    return ''.join(kept)
|
|
46
|
+
|
|
47
|
+
def get_type(token):
    """
    Classify a token by the kinds of characters it contains.

    Returns 'EMPTY' for an empty token, otherwise the first type (in the
    priority order listed below) that no character of the token rules
    out, or 'NO' when every type has been ruled out.
    """
    if not token:
        return 'EMPTY'

    # Candidate types, in the priority order used to pick the result.
    ordered = (
        'AllUpper', 'AllDigit', 'AllSymbol',
        'AllUpperDigit', 'AllUpperSymbol', 'AllDigitSymbol',
        'AllUpperDigitSymbol',
        'InitUpper',
        'AllLetter',
        'AllAlnum',
    )

    # Types ruled out by seeing one character of each kind.
    ruled_out_by_upper = ('AllDigit', 'AllSymbol', 'AllDigitSymbol')
    ruled_out_by_digit = (
        'AllUpper', 'AllSymbol', 'AllUpperSymbol', 'AllLetter',
    )
    ruled_out_by_lower = (
        'AllUpper', 'AllDigit', 'AllSymbol',
        'AllUpperDigit', 'AllUpperSymbol', 'AllDigitSymbol',
        'AllUpperDigitSymbol',
    )
    ruled_out_by_other = (
        'AllUpper', 'AllDigit', 'AllUpperDigit', 'AllLetter', 'AllAlnum',
    )

    candidates = set(ordered)
    for position, c in enumerate(token):
        if c.isupper():
            candidates.difference_update(ruled_out_by_upper)
        elif c.isdigit() or c in (',', '.'):
            # ',' and '.' are treated like digits.
            candidates.difference_update(ruled_out_by_digit)
        elif c.islower():
            candidates.difference_update(ruled_out_by_lower)
        else:
            candidates.difference_update(ruled_out_by_other)

        if position == 0 and not c.isupper():
            candidates.discard('InitUpper')

    for tag in ordered:
        if tag in candidates:
            return tag
    return 'NO'
|
|
93
|
+
|
|
94
|
+
def get_2d(token):
    """Return True when the token consists of exactly two digits."""
    if len(token) != 2:
        return False
    return token.isdigit()
|
|
96
|
+
|
|
97
|
+
def get_4d(token):
    """Return True when the token consists of exactly four digits."""
    if len(token) != 4:
        return False
    return token.isdigit()
|
|
99
|
+
|
|
100
|
+
def get_da(token):
    """
    Return True when the token is made only of digits and letters and
    contains at least one of each.
    """
    non_digits = [c for c in token if not c.isdigit()]
    # Anything that is neither a digit nor a letter disqualifies the token.
    if not all(c.isalpha() for c in non_digits):
        return False
    digit_count = len(token) - len(non_digits)
    return digit_count > 0 and len(non_digits) > 0
|
|
111
|
+
|
|
112
|
+
def get_dand(token, p):
    """
    Return True when the token is made only of digits and the character
    L{p}, and contains at least one of each.
    """
    non_digits = [c for c in token if not c.isdigit()]
    # Any non-digit other than p disqualifies the token.
    if any(c != p for c in non_digits):
        return False
    digit_count = len(token) - len(non_digits)
    return digit_count > 0 and len(non_digits) > 0
|
|
123
|
+
|
|
124
|
+
def get_all_other(token):
    """Return True when the token contains no alphanumeric character."""
    return not any(c.isalnum() for c in token)
|
|
129
|
+
|
|
130
|
+
def get_capperiod(token):
    """Return True when the token is one uppercase letter followed by '.'."""
    if len(token) != 2:
        return False
    first, second = token[0], token[1]
    return first.isupper() and second == '.'
|
|
132
|
+
|
|
133
|
+
def contains_upper(token):
    """Return True when at least one character of the token is uppercase."""
    return any(c.isupper() for c in token)
|
|
138
|
+
|
|
139
|
+
def contains_lower(token):
    """Return True when at least one character of the token is lowercase."""
    return any(c.islower() for c in token)
|
|
144
|
+
|
|
145
|
+
def contains_alpha(token):
    """Return True when at least one character of the token is a letter."""
    return any(c.isalpha() for c in token)
|
|
150
|
+
|
|
151
|
+
def contains_digit(token):
    """Return True when at least one character of the token is a digit."""
    return any(c.isdigit() for c in token)
|
|
156
|
+
|
|
157
|
+
def contains_symbol(token):
    """
    Return True when at least one character of the token is a symbol,
    i.e. is not alphanumeric; False otherwise (including the empty token).
    """
    # Logical negation is required here: the bitwise form ~c.isalnum()
    # yields -1 or -2, both truthy, which reported a symbol in every
    # non-empty token.
    return any(not c.isalnum() for c in token)
|
|
162
|
+
|
|
163
|
+
def b(v):
    """Render a truth value as the attribute string 'yes' or 'no'."""
    if v:
        return 'yes'
    return 'no'
|
|
165
|
+
|
|
166
|
+
def observation(v, defval=''):
    """
    Add observation fields derived from the token v['w'] to the item v.

    The item is modified in place. L{defval} is stored for a prefix or
    suffix field when the token is shorter than the requested length.
    """
    w = v['w']

    # Lowercased token.
    v['wl'] = w.lower()
    # Token shape, and the shape with repeated classes collapsed.
    shape = get_shape(w)
    v['shape'] = shape
    v['shaped'] = degenerate(shape)
    # Token type.
    v['type'] = get_type(w)

    # Prefixes p1..p4 (length one to four).
    for n in range(1, 5):
        v['p%d' % n] = w[:n] if len(w) >= n else defval

    # Suffixes s1..s4 (length one to four).
    for n in range(1, 5):
        v['s%d' % n] = w[-n:] if len(w) >= n else defval

    # Two digits / four digits.
    v['2d'] = b(get_2d(w))
    v['4d'] = b(get_4d(w))
    # Alphanumeric token (digits and letters).
    v['d&a'] = b(get_da(w))
    # Digits combined with one of '-', '/', ',' or '.'.
    for mark in ('-', '/', ',', '.'):
        v['d&' + mark] = b(get_dand(w, mark))
    # An uppercase letter followed by '.'.
    v['up'] = b(get_capperiod(w))

    # An initial uppercase letter.
    v['iu'] = b(w and w[0].isupper())
    # All uppercase / all lowercase / all digit characters.
    v['au'] = b(w.isupper())
    v['al'] = b(w.islower())
    v['ad'] = b(w.isdigit())
    # All other (non-alphanumeric) characters.
    v['ao'] = b(get_all_other(w))

    # Contains an uppercase letter / a lowercase letter / a letter /
    # a digit / a symbol.
    v['cu'] = b(contains_upper(w))
    v['cl'] = b(contains_lower(w))
    v['ca'] = b(contains_alpha(w))
    v['cd'] = b(contains_digit(w))
    v['cs'] = b(contains_symbol(w))
|
|
226
|
+
|
|
227
|
+
def disjunctive(X, t, field, begin, end):
    """
    Append to item t one feature per item found in the offset window
    [begin, end] around t, each carrying that item's L{field} value under
    the shared name 'field[begin..end]'. Offsets that fall outside the
    sequence are skipped.

    @type   X:      list of mapping objects
    @param  X:      The item sequence.
    @type   t:      int
    @param  t:      The index of the item receiving the features.
    @type   field:  str
    @param  field:  The field whose values are collected.
    @type   begin:  int
    @param  begin:  The first offset of the window (inclusive).
    @type   end:    int
    @param  end:    The last offset of the window (inclusive).
    """
    name = '%s[%d..%d]' % (field, begin, end)
    # Clamp the window to the sequence once instead of testing each
    # position against range(0, len(X)), which is a full list on Python 2.
    first = max(0, t + begin)
    last = min(len(X) - 1, t + end)
    for p in range(first, last + 1):
        X[t]['F'].append('%s=%s' % (name, X[p][field]))
|
|
234
|
+
|
|
235
|
+
# Fields used for unigram templates.
U = [
    # Token, lowercased token, tags, shapes and type.
    'w', 'wl', 'pos', 'chk', 'shape', 'shaped', 'type',
    # Prefixes and suffixes.
    'p1', 'p2', 'p3', 'p4',
    's1', 's2', 's3', 's4',
    # Digit patterns and capital-plus-period.
    '2d', '4d', 'd&a', 'd&-', 'd&/', 'd&,', 'd&.', 'up',
    # "Initial"/"all" character-class flags.
    'iu', 'au', 'al', 'ad', 'ao',
    # "Contains" character-class flags.
    'cu', 'cl', 'ca', 'cd', 'cs',
]

# Fields used for bigram templates.
B = ['w', 'pos', 'chk', 'shaped', 'type']

templates = []
# Unigrams at offsets -2..2 for every field in U.
for name in U:
    templates.extend(((name, i),) for i in range(-2, 3))
# Bigrams of adjacent offsets within -2..2 for every field in B.
for name in B:
    templates.extend(((name, i), (name, i + 1)) for i in range(-2, 2))
|
|
250
|
+
|
|
251
|
+
def feature_extractor(X):
    """
    Fill the 'F' field of every item in a sequence with NER attributes:
    template-based features, disjunctive word features and sequence
    boundary markers. The sequence is modified in place.
    """
    # Derive the observation fields of every item.
    for item in X:
        observation(item)

    # Apply the feature templates.
    crfutils.apply_templates(X, templates)

    # Disjunctive word features over the four items before and after.
    for t in range(len(X)):
        for begin, end in ((-4, -1), (1, 4)):
            disjunctive(X, t, 'w', begin, end)

    # Nothing to mark on an empty sequence.
    if not X:
        return

    # Mark the sequence boundaries.
    X[0]['F'].append('__BOS__')
    X[-1]['F'].append('__EOS__')


if __name__ == '__main__':
    crfutils.main(feature_extractor, fields=fields, sep=separator)
|