opener-opinion-detector-basic 2.0.0 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (148) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +2 -0
  3. data/ext/hack/Rakefile +0 -2
  4. data/lib/opener/opinion_detector_basic/version.rb +1 -1
  5. data/opener-opinion-detector-basic.gemspec +0 -1
  6. data/task/compile.rake +1 -1
  7. data/task/requirements.rake +0 -1
  8. metadata +2 -142
  9. data/core/vendor/src/crfsuite/AUTHORS +0 -1
  10. data/core/vendor/src/crfsuite/COPYING +0 -27
  11. data/core/vendor/src/crfsuite/ChangeLog +0 -103
  12. data/core/vendor/src/crfsuite/INSTALL +0 -236
  13. data/core/vendor/src/crfsuite/Makefile.am +0 -19
  14. data/core/vendor/src/crfsuite/Makefile.in +0 -783
  15. data/core/vendor/src/crfsuite/README +0 -183
  16. data/core/vendor/src/crfsuite/aclocal.m4 +0 -9018
  17. data/core/vendor/src/crfsuite/autogen.sh +0 -38
  18. data/core/vendor/src/crfsuite/compile +0 -143
  19. data/core/vendor/src/crfsuite/config.guess +0 -1502
  20. data/core/vendor/src/crfsuite/config.h.in +0 -198
  21. data/core/vendor/src/crfsuite/config.sub +0 -1714
  22. data/core/vendor/src/crfsuite/configure +0 -14273
  23. data/core/vendor/src/crfsuite/configure.in +0 -149
  24. data/core/vendor/src/crfsuite/crfsuite.sln +0 -42
  25. data/core/vendor/src/crfsuite/depcomp +0 -630
  26. data/core/vendor/src/crfsuite/example/chunking.py +0 -49
  27. data/core/vendor/src/crfsuite/example/crfutils.py +0 -179
  28. data/core/vendor/src/crfsuite/example/ner.py +0 -270
  29. data/core/vendor/src/crfsuite/example/pos.py +0 -78
  30. data/core/vendor/src/crfsuite/example/template.py +0 -88
  31. data/core/vendor/src/crfsuite/frontend/Makefile.am +0 -29
  32. data/core/vendor/src/crfsuite/frontend/Makefile.in +0 -640
  33. data/core/vendor/src/crfsuite/frontend/dump.c +0 -116
  34. data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +0 -129
  35. data/core/vendor/src/crfsuite/frontend/iwa.c +0 -273
  36. data/core/vendor/src/crfsuite/frontend/iwa.h +0 -65
  37. data/core/vendor/src/crfsuite/frontend/learn.c +0 -439
  38. data/core/vendor/src/crfsuite/frontend/main.c +0 -137
  39. data/core/vendor/src/crfsuite/frontend/option.c +0 -93
  40. data/core/vendor/src/crfsuite/frontend/option.h +0 -86
  41. data/core/vendor/src/crfsuite/frontend/readdata.h +0 -38
  42. data/core/vendor/src/crfsuite/frontend/reader.c +0 -136
  43. data/core/vendor/src/crfsuite/frontend/tag.c +0 -427
  44. data/core/vendor/src/crfsuite/genbinary.sh.in +0 -15
  45. data/core/vendor/src/crfsuite/include/Makefile.am +0 -11
  46. data/core/vendor/src/crfsuite/include/Makefile.in +0 -461
  47. data/core/vendor/src/crfsuite/include/crfsuite.h +0 -1063
  48. data/core/vendor/src/crfsuite/include/crfsuite.hpp +0 -555
  49. data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +0 -400
  50. data/core/vendor/src/crfsuite/include/os.h +0 -61
  51. data/core/vendor/src/crfsuite/install-sh +0 -520
  52. data/core/vendor/src/crfsuite/lib/cqdb/COPYING +0 -28
  53. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +0 -21
  54. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +0 -549
  55. data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +0 -86
  56. data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +0 -524
  57. data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +0 -587
  58. data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +0 -976
  59. data/core/vendor/src/crfsuite/lib/crf/Makefile.am +0 -46
  60. data/core/vendor/src/crfsuite/lib/crf/Makefile.in +0 -721
  61. data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +0 -216
  62. data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +0 -353
  63. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +0 -705
  64. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +0 -943
  65. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +0 -352
  66. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +0 -994
  67. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +0 -550
  68. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +0 -492
  69. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +0 -236
  70. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +0 -272
  71. data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +0 -106
  72. data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +0 -118
  73. data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +0 -80
  74. data/core/vendor/src/crfsuite/lib/crf/src/logging.c +0 -91
  75. data/core/vendor/src/crfsuite/lib/crf/src/logging.h +0 -48
  76. data/core/vendor/src/crfsuite/lib/crf/src/params.c +0 -335
  77. data/core/vendor/src/crfsuite/lib/crf/src/params.h +0 -80
  78. data/core/vendor/src/crfsuite/lib/crf/src/quark.c +0 -172
  79. data/core/vendor/src/crfsuite/lib/crf/src/quark.h +0 -46
  80. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +0 -1107
  81. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +0 -160
  82. data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +0 -408
  83. data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +0 -242
  84. data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +0 -507
  85. data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +0 -338
  86. data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +0 -435
  87. data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +0 -341
  88. data/core/vendor/src/crfsuite/ltmain.sh +0 -8413
  89. data/core/vendor/src/crfsuite/missing +0 -376
  90. data/core/vendor/src/crfsuite/swig/Makefile.am +0 -13
  91. data/core/vendor/src/crfsuite/swig/Makefile.in +0 -365
  92. data/core/vendor/src/crfsuite/swig/crfsuite.cpp +0 -2
  93. data/core/vendor/src/crfsuite/swig/export.i +0 -32
  94. data/core/vendor/src/crfsuite/swig/python/README +0 -92
  95. data/core/vendor/src/crfsuite/swig/python/crfsuite.py +0 -329
  96. data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +0 -14355
  97. data/core/vendor/src/crfsuite/swig/python/export_wrap.h +0 -63
  98. data/core/vendor/src/crfsuite/swig/python/prepare.sh +0 -9
  99. data/core/vendor/src/crfsuite/swig/python/sample_tag.py +0 -52
  100. data/core/vendor/src/crfsuite/swig/python/sample_train.py +0 -68
  101. data/core/vendor/src/crfsuite/swig/python/setup.py +0 -44
  102. data/core/vendor/src/crfsuite/win32/stdint.h +0 -679
  103. data/core/vendor/src/liblbfgs/AUTHORS +0 -1
  104. data/core/vendor/src/liblbfgs/COPYING +0 -22
  105. data/core/vendor/src/liblbfgs/ChangeLog +0 -120
  106. data/core/vendor/src/liblbfgs/INSTALL +0 -231
  107. data/core/vendor/src/liblbfgs/Makefile.am +0 -10
  108. data/core/vendor/src/liblbfgs/Makefile.in +0 -638
  109. data/core/vendor/src/liblbfgs/NEWS +0 -0
  110. data/core/vendor/src/liblbfgs/README +0 -71
  111. data/core/vendor/src/liblbfgs/aclocal.m4 +0 -6985
  112. data/core/vendor/src/liblbfgs/autogen.sh +0 -38
  113. data/core/vendor/src/liblbfgs/config.guess +0 -1411
  114. data/core/vendor/src/liblbfgs/config.h.in +0 -64
  115. data/core/vendor/src/liblbfgs/config.sub +0 -1500
  116. data/core/vendor/src/liblbfgs/configure +0 -21146
  117. data/core/vendor/src/liblbfgs/configure.in +0 -107
  118. data/core/vendor/src/liblbfgs/depcomp +0 -522
  119. data/core/vendor/src/liblbfgs/include/lbfgs.h +0 -745
  120. data/core/vendor/src/liblbfgs/install-sh +0 -322
  121. data/core/vendor/src/liblbfgs/lbfgs.sln +0 -26
  122. data/core/vendor/src/liblbfgs/lib/Makefile.am +0 -24
  123. data/core/vendor/src/liblbfgs/lib/Makefile.in +0 -499
  124. data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +0 -133
  125. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +0 -294
  126. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +0 -298
  127. data/core/vendor/src/liblbfgs/lib/lbfgs.c +0 -1371
  128. data/core/vendor/src/liblbfgs/lib/lib.vcxproj +0 -95
  129. data/core/vendor/src/liblbfgs/ltmain.sh +0 -6426
  130. data/core/vendor/src/liblbfgs/missing +0 -353
  131. data/core/vendor/src/liblbfgs/sample/Makefile.am +0 -15
  132. data/core/vendor/src/liblbfgs/sample/Makefile.in +0 -433
  133. data/core/vendor/src/liblbfgs/sample/sample.c +0 -81
  134. data/core/vendor/src/liblbfgs/sample/sample.cpp +0 -126
  135. data/core/vendor/src/liblbfgs/sample/sample.vcxproj +0 -105
  136. data/core/vendor/src/svm_light/LICENSE.txt +0 -59
  137. data/core/vendor/src/svm_light/Makefile +0 -105
  138. data/core/vendor/src/svm_light/kernel.h +0 -40
  139. data/core/vendor/src/svm_light/svm_classify.c +0 -197
  140. data/core/vendor/src/svm_light/svm_common.c +0 -985
  141. data/core/vendor/src/svm_light/svm_common.h +0 -301
  142. data/core/vendor/src/svm_light/svm_hideo.c +0 -1062
  143. data/core/vendor/src/svm_light/svm_learn.c +0 -4147
  144. data/core/vendor/src/svm_light/svm_learn.h +0 -169
  145. data/core/vendor/src/svm_light/svm_learn_main.c +0 -397
  146. data/core/vendor/src/svm_light/svm_loqo.c +0 -211
  147. data/task/c.rake +0 -36
  148. data/task/submodules.rake +0 -5
@@ -1,179 +0,0 @@
1
- """
2
- A miscellaneous utility for sequential labeling.
3
- Copyright 2010,2011 Naoaki Okazaki.
4
- """
5
-
6
- import optparse
7
- import sys
8
-
9
def apply_templates(X, templates):
    """
    Generate features for an item sequence by applying feature templates.
    A feature template consists of a tuple of (name, offset) pairs,
    where name and offset specify a field name and offset from which
    the template extracts a feature value. For every item, the addressed
    values are joined with '|' and appended to the item's 'F' field as
    'name[offset]|...=value|...'. A template is skipped for an item when
    any of its offsets points outside the sequence.

    @type   X:          list of mapping objects
    @param  X:          The item sequence; each item needs an 'F' list.
    @type   templates:  sequence of tuples of (str, int)
    @param  templates:  The feature templates.
    """
    length = len(X)
    for template in templates:
        name = '|'.join(['%s[%d]' % (f, o) for f, o in template])
        for t, item in enumerate(X):
            values = []
            for field, offset in template:
                p = t + offset
                # Plain bounds comparison; `p in range(...)` materialises a
                # list for every test on Python 2.
                if p < 0 or p >= length:
                    values = []
                    break
                values.append(X[p][field])
            if values:
                item['F'].append('%s=%s' % (name, '|'.join(values)))
34
-
35
def readiter(fi, names, sep=' '):
    """
    Return an iterator for item sequences read from a file object.
    This function reads sequences from a file object L{fi}, and yields
    each sequence as a list of mapping objects. Each line (item) is split
    by the separator L{sep}; the leading values are named by L{names} and
    stored in a mapping object. Every item has a field 'F' that is
    reserved for storing features. An empty line ends a sequence; a final
    sequence that is not followed by an empty line is yielded as well.

    @type   fi:     file
    @param  fi:     The file object (any iterable of lines).
    @type   names:  tuple
    @param  names:  The list of field names.
    @type   sep:    str
    @param  sep:    The separator character.
    @rtype          list of mapping objects
    @return         An iterator for sequences.
    @raise ValueError: If a line has fewer fields than L{names}.
    """
    X = []
    for line in fi:
        line = line.strip('\n')
        if not line:
            yield X
            X = []
            continue
        fields = line.split(sep)
        if len(fields) < len(names):
            raise ValueError(
                'Too few fields (%d) for %r\n%s' % (len(fields), names, line))
        item = {'F': []}    # 'F' is reserved for features.
        # zip() stops at the shorter input, so surplus fields are ignored.
        item.update(zip(names, fields))
        X.append(item)
    # Do not lose the last sequence when the input lacks a trailing
    # empty line.
    if X:
        yield X
69
-
70
def escape(src):
    """
    Replace every colon in a feature name with '__COLON__'.

    @type   src:    str
    @param  src:    A feature name.
    @rtype          str
    @return         The feature name with colons replaced.
    """
    pieces = src.split(':')
    return '__COLON__'.join(pieces)
80
-
81
def output_features(fo, X, field=''):
    """
    Write the features (and reference labels) of a sequence to L{fo}.
    One line is written per item: the value of L{field} (only when
    L{field} is a non-empty string) followed by tab-prefixed features.
    A string feature is written escaped; any other feature is treated as
    a (name, value) pair and written as 'name:value' with '%f' formatting.
    An empty line terminates the sequence.

    @type   fo:     file
    @param  fo:     The file object.
    @type   X:      list of mapping objects
    @param  X:      The sequence.
    @type   field:  str
    @param  field:  The field name of reference labels.
    """
    for item in X:
        if field:
            fo.write('%s' % item[field])
        for attr in item['F']:
            if not isinstance(attr, str):
                # Non-string features are (name, weight) pairs.
                fo.write('\t%s:%f' % (escape(attr[0]), attr[1]))
            else:
                fo.write('\t%s' % escape(attr))
        fo.write('\n')
    fo.write('\n')
104
-
105
def to_crfsuite(X):
    """
    Build a crfsuite.ItemSequence from an item sequence.
    String features become attributes with only a name; any other
    feature is treated as a (name, value) pair. Names are escaped.

    @type   X:      list of mapping objects
    @param  X:      The sequence.
    @rtype          crfsuite.ItemSequence
    @return         The same sequence in crfsuite.ItemSequence type.
    """
    # Imported here so the module is only required when tagging.
    import crfsuite
    sequence = crfsuite.ItemSequence()
    for entry in X:
        item = crfsuite.Item()
        for feature in entry['F']:
            if isinstance(feature, str):
                attribute = crfsuite.Attribute(escape(feature))
            else:
                attribute = crfsuite.Attribute(escape(feature[0]), feature[1])
            item.append(attribute)
        sequence.append(item)
    return sequence
126
-
127
def main(feature_extractor, fields='w pos y', sep=' '):
    """
    Command-line driver: read sequences from STDIN and write to STDOUT.
    Without -t, the features produced by L{feature_extractor} are written
    together with the 'y' field of each item. With -t, each sequence is
    tagged with the given model and the input fields are written followed
    by the predicted label.

    @type   feature_extractor:  callable
    @param  feature_extractor:  Called with each sequence; expected to fill
                                the 'F' field of the items in place.
    @type   fields:             str
    @param  fields:             Default space-separated field names (-f).
    @type   sep:                str
    @param  sep:                Default column separator (-s).
    """
    source = sys.stdin
    sink = sys.stdout

    # Set up and parse the command-line options.
    parser = optparse.OptionParser(usage="""usage: %prog [options]
This utility reads a data set from STDIN, and outputs attributes to STDOUT.
Each line of a data set must consist of field values separated by SEPARATOR
characters. The names and order of field values can be specified by -f option.
The separator character can be specified with -s option. Instead of outputting
attributes, this utility tags the input data when a model file is specified by
-t option (CRFsuite Python module must be installed)."""
        )
    parser.add_option(
        '-t', dest='model',
        help='tag the input using the model (requires "crfsuite" module)'
        )
    parser.add_option(
        '-f', dest='fields', default=fields,
        help='specify field names of input data [default: "%default"]'
        )
    parser.add_option(
        '-s', dest='separator', default=sep,
        help='specify the separator of columns of input data [default: "%default"]'
        )
    options, _ = parser.parse_args()

    # The fields of input: ('w', 'pos', 'y') by default.
    names = options.fields.split(' ')

    if options.model:
        # Tagging mode: load an existing model (needs the crfsuite module).
        import crfsuite
        tagger = crfsuite.Tagger()
        tagger.open(options.model)

        for X in readiter(source, names, options.separator):
            feature_extractor(X)
            labels = tagger.tag(to_crfsuite(X))
            for t, item in enumerate(X):
                sink.write('\t'.join([item[f] for f in names]))
                sink.write('\t%s\n' % labels[t])
            sink.write('\n')
    else:
        # Feature-output mode: readiter() yields one sequence at a time
        # from STDIN.
        for X in readiter(source, names, options.separator):
            feature_extractor(X)
            output_features(sink, X, 'y')
@@ -1,270 +0,0 @@
1
- #!/usr/bin/env python
2
-
3
- """
4
- A feature extractor for named entity recognition (NER).
5
- Copyright 2010,2011 Naoaki Okazaki.
6
- """
7
-
8
- # Separator of field values.
9
- separator = ' '
10
-
11
- # Field names of the input data.
12
- fields = 'y w pos chk'
13
-
14
-
15
- import crfutils
16
-
17
def get_shape(token):
    """
    Return the shape of a token, one output character per input character.
    Uppercase letters map to 'U', lowercase to 'L', digits to 'D', and
    punctuation is folded into a representative of its group ('.', ';',
    '-', '(' or ')'). Any other character is kept as is.

    @type   token:  str
    @param  token:  The token.
    @rtype          str
    @return         The shape string, as long as the token.
    """
    groups = (
        ('.', '.,'),
        (';', ';:?!'),
        ('-', '+-*/=|_'),
        ('(', '({[<'),
        (')', ')}]>'),
    )
    symbol_shape = {}
    for shape, members in groups:
        for member in members:
            symbol_shape[member] = shape

    def shape_of(c):
        # Letter and digit classes take precedence over the symbol table.
        if c.isupper():
            return 'U'
        if c.islower():
            return 'L'
        if c.isdigit():
            return 'D'
        return symbol_shape.get(c, c)

    # Join once instead of growing a string character by character.
    return ''.join([shape_of(c) for c in token])
39
-
40
def degenerate(src):
    """
    Collapse every run of identical consecutive characters into one.

    @type   src:    str
    @param  src:    The source string (e.g. a token shape).
    @rtype          str
    @return         The string without adjacent repeated characters.
    """
    kept = []
    for c in src:
        if not kept or kept[-1] != c:
            kept.append(c)
    # Join once instead of growing a string character by character.
    return ''.join(kept)
46
-
47
def get_type(token):
    """
    Classify a token by the kinds of characters it contains.
    All candidate types start out possible; each character rules out the
    types it contradicts, and the first surviving type in priority order
    is returned. ',' and '.' are classified together with digits.

    @type   token:  str
    @param  token:  The token.
    @rtype          str
    @return         'EMPTY' for an empty token, the first surviving type
                    name, or 'NO' when none survives.
    """
    priority = (
        'AllUpper', 'AllDigit', 'AllSymbol',
        'AllUpperDigit', 'AllUpperSymbol', 'AllDigitSymbol',
        'AllUpperDigitSymbol',
        'InitUpper',
        'AllLetter',
        'AllAlnum',
        )
    # Types contradicted by a character of each kind.
    ruled_out = {
        'upper': ('AllDigit', 'AllSymbol', 'AllDigitSymbol'),
        'digit': ('AllUpper', 'AllSymbol', 'AllUpperSymbol', 'AllLetter'),
        'lower': ('AllUpper', 'AllDigit', 'AllSymbol', 'AllUpperDigit',
                  'AllUpperSymbol', 'AllDigitSymbol', 'AllUpperDigitSymbol'),
        'other': ('AllUpper', 'AllDigit', 'AllUpperDigit', 'AllLetter',
                  'AllAlnum'),
        }

    if not token:
        return 'EMPTY'

    alive = set(priority)
    for position, ch in enumerate(token):
        if ch.isupper():
            kind = 'upper'
        elif ch.isdigit() or ch in (',', '.'):
            kind = 'digit'
        elif ch.islower():
            kind = 'lower'
        else:
            kind = 'other'
        alive.difference_update(ruled_out[kind])

        # 'InitUpper' only depends on the first character.
        if position == 0 and kind != 'upper':
            alive.discard('InitUpper')

    for tag in priority:
        if tag in alive:
            return tag
    return 'NO'
93
-
94
def get_2d(token):
    """Return True if the token consists of exactly two digits."""
    return token.isdigit() and len(token) == 2
96
-
97
def get_4d(token):
    """Return True if the token consists of exactly four digits."""
    return token.isdigit() and len(token) == 4
99
-
100
def get_da(token):
    """
    Return True if the token is made up only of digits and letters and
    contains at least one of each.
    """
    if not all(ch.isdigit() or ch.isalpha() for ch in token):
        return False
    has_digit = any(ch.isdigit() for ch in token)
    has_alpha = any(ch.isalpha() for ch in token)
    return has_digit and has_alpha
111
-
112
def get_dand(token, p):
    """
    Return True if the token is made up only of digits and the character
    L{p} and contains at least one of each.
    """
    has_digit = False
    has_mark = False
    for ch in token:
        if ch.isdigit():
            has_digit = True
            continue
        if ch != p:
            # Anything other than a digit or the mark disqualifies the token.
            return False
        has_mark = True
    return has_digit and has_mark
123
-
124
def get_all_other(token):
    """Return True if the token contains no alphanumeric character."""
    return not any(ch.isalnum() for ch in token)
129
-
130
def get_capperiod(token):
    """Return True if the token is an uppercase letter followed by '.'."""
    if len(token) != 2:
        return False
    first, second = token
    return first.isupper() and second == '.'
132
-
133
def contains_upper(token):
    """Return True if the token contains at least one uppercase letter."""
    # any() stops at the first match instead of scanning the whole token.
    return any(c.isupper() for c in token)
138
-
139
def contains_lower(token):
    """Return True if the token contains at least one lowercase letter."""
    # any() stops at the first match instead of scanning the whole token.
    return any(c.islower() for c in token)
144
-
145
def contains_alpha(token):
    """Return True if the token contains at least one alphabetic letter."""
    # any() stops at the first match instead of scanning the whole token.
    return any(c.isalpha() for c in token)
150
-
151
def contains_digit(token):
    """Return True if the token contains at least one digit."""
    # any() stops at the first match instead of scanning the whole token.
    return any(c.isdigit() for c in token)
156
-
157
def contains_symbol(token):
    """
    Return True if the token contains at least one non-alphanumeric
    character.
    """
    # Logical `not` is required here: `~` on a bool is bitwise inversion
    # and yields -1 or -2, both of which are truthy.
    return any(not c.isalnum() for c in token)
162
-
163
def b(v):
    """Map a truth value to the feature strings 'yes' / 'no'."""
    if v:
        return 'yes'
    return 'no'
165
-
166
def observation(v, defval=''):
    """
    Add observation fields derived from the token v['w'] to the item L{v}.

    @type   v:      mapping object
    @param  v:      The item; must have the token in field 'w'.
    @type   defval: str
    @param  defval: Value stored for a prefix or suffix that is longer
                    than the token.
    """
    word = v['w']
    size = len(word)

    # Lowercased token, token shape, degenerated shape and token type.
    v['wl'] = word.lower()
    shape = get_shape(word)
    v['shape'] = shape
    v['shaped'] = degenerate(shape)
    v['type'] = get_type(word)

    # Prefixes (length between one and four).
    for n in range(1, 5):
        v['p%d' % n] = word[:n] if size >= n else defval

    # Suffixes (length between one and four).
    for n in range(1, 5):
        v['s%d' % n] = word[-n:] if size >= n else defval

    # Two digits / four digits / alphanumeric token.
    v['2d'] = b(get_2d(word))
    v['4d'] = b(get_4d(word))
    v['d&a'] = b(get_da(word))
    # Digits combined with '-', '/', ',' or '.'.
    for mark in ('-', '/', ',', '.'):
        v['d&' + mark] = b(get_dand(word, mark))
    # An uppercase letter followed by '.'.
    v['up'] = b(get_capperiod(word))

    # Initial uppercase; all uppercase / lowercase / digits / other.
    v['iu'] = b(word and word[0].isupper())
    v['au'] = b(word.isupper())
    v['al'] = b(word.islower())
    v['ad'] = b(word.isdigit())
    v['ao'] = b(get_all_other(word))

    # Contains an uppercase / lowercase / alphabetic / digit / symbol char.
    v['cu'] = b(contains_upper(word))
    v['cl'] = b(contains_lower(word))
    v['ca'] = b(contains_alpha(word))
    v['cd'] = b(contains_digit(word))
    v['cs'] = b(contains_symbol(word))
226
-
227
def disjunctive(X, t, field, begin, end):
    """
    Append disjunctive features to item L{t}: one 'field[begin..end]=value'
    feature for every position within offsets begin..end (inclusive) of
    L{t} that lies inside the sequence. All of them share the same name,
    so the exact offset of a value is not encoded.

    @type   X:      list of mapping objects
    @param  X:      The item sequence.
    @type   t:      int
    @param  t:      Index of the item that receives the features.
    @type   field:  str
    @param  field:  The field whose values are used.
    @type   begin:  int
    @param  begin:  First offset relative to L{t}.
    @type   end:    int
    @param  end:    Last offset relative to L{t} (inclusive).
    """
    name = '%s[%d..%d]' % (field, begin, end)
    length = len(X)
    for offset in range(begin, end + 1):
        p = t + offset
        # Plain bounds comparison; `p in range(...)` materialises a list
        # for every test on Python 2.
        if p < 0 or p >= length:
            continue
        X[t]['F'].append('%s=%s' % (name, X[p][field]))
234
-
235
# Fields used for unigram templates (offsets -2..2).
U = [
    'w', 'wl', 'pos', 'chk', 'shape', 'shaped', 'type',
    'p1', 'p2', 'p3', 'p4',
    's1', 's2', 's3', 's4',
    '2d', '4d', 'd&a', 'd&-', 'd&/', 'd&,', 'd&.', 'up',
    'iu', 'au', 'al', 'ad', 'ao',
    'cu', 'cl', 'ca', 'cd', 'cs',
    ]
# Fields used for bigram templates (adjacent offsets within -2..2).
B = ['w', 'pos', 'chk', 'shaped', 'type']

templates = []
for name in U:
    templates.extend(((name, offset),) for offset in range(-2, 3))
for name in B:
    templates.extend(
        ((name, offset), (name, offset + 1)) for offset in range(-2, 2))
250
-
251
def feature_extractor(X):
    """
    Fill the 'F' field of every item of the sequence L{X} in place:
    observations, template features, disjunctive word features over the
    four preceding and four following positions, and BOS/EOS markers.
    """
    # Append observations.
    for item in X:
        observation(item)

    # Apply the feature templates.
    crfutils.apply_templates(X, templates)

    # Append disjunctive features for the left and right contexts.
    for position, _ in enumerate(X):
        for begin, end in ((-4, -1), (1, 4)):
            disjunctive(X, position, 'w', begin, end)

    # Append BOS and EOS features.
    if not X:
        return
    X[0]['F'].append('__BOS__')
    X[-1]['F'].append('__EOS__')
268
-
269
- if __name__ == '__main__':
270
- crfutils.main(feature_extractor, fields=fields, sep=separator)
@@ -1,78 +0,0 @@
1
- #!/usr/bin/env python
2
-
3
- """
4
- An example for part-of-speech tagging.
5
- Copyright 2010,2011 Naoaki Okazaki.
6
- """
7
-
8
- # Separator of field values.
9
- separator = ' '
10
-
11
- # Field names of the input data.
12
- fields = 'w num cap sym p1 p2 p3 p4 s1 s2 s3 s4 y'
13
-
14
- # Feature template. This template is identical to the one bundled in CRF++
15
- # distribution, but written in a Python object.
16
# The same template set as before, generated instead of written out.
# Order is significant and kept: per-field features at offset 0, word
# unigrams, contiguous word n-grams (n = 2..5) inside the window -2..2,
# then pairs of the current word with each word up to nine positions
# back and forward. (('w', 0), ('w', 1)) therefore occurs twice, as it
# did in the literal.
templates = tuple(
    [((name, 0),) for name in (
        'num', 'cap', 'sym',
        'p1', 'p2', 'p3', 'p4',
        's1', 's2', 's3', 's4')]
    + [(('w', offset),) for offset in (0, -1, 1, -2, 2)]
    + [tuple(('w', offset) for offset in range(start, start + size))
       for size in (2, 3, 4, 5)
       for start in range(-2, 4 - size)]
    + [(('w', 0), ('w', -distance)) for distance in range(1, 10)]
    + [(('w', 0), ('w', distance)) for distance in range(1, 10)]
    )
65
-
66
-
67
- import crfutils
68
-
69
def feature_extractor(X):
    """
    Fill the 'F' field of every item of the sequence L{X} in place with
    the template features (in fact, attributes) and BOS/EOS markers.
    """
    crfutils.apply_templates(X, templates)
    if not X:
        return
    # BOS and EOS are not expressible as templates; add them manually.
    first, last = X[0], X[-1]
    first['F'].append('__BOS__')
    last['F'].append('__EOS__')
76
-
77
- if __name__ == '__main__':
78
- crfutils.main(feature_extractor, fields=fields, sep=separator)
@@ -1,88 +0,0 @@
1
- #!/usr/bin/env python
2
-
3
- import re
4
- import sys
5
-
6
class FeatureExtractor:
    """
    Apply unigram feature templates that use '%x[row,col]' macros.
    In a macro, row is an offset relative to the current item and col is
    an index into the item's 'x' list.
    """

    def __init__(self):
        self.macro = re.compile(r'%x\[(?P<row>[\d-]+),(?P<col>[\d]+)\]')
        self.inst = []          # Sequence currently being processed.
        self.t = 0              # Position of the current item in it.
        self.templates = []

    def read(self, fi):
        """
        Load templates from an iterable of lines, replacing any loaded
        before. Lines starting with '#' are skipped, lines starting with
        'U' are stored with every ':' replaced by '=', and a bare 'B'
        line is ignored. Any other line starting with 'B' is reported on
        STDERR and terminates the program with exit status 1.
        """
        self.templates = []
        for line in fi:
            line = line.strip()
            if line.startswith('#'):
                continue
            if line.startswith('U'):
                self.templates.append(line.replace(':', '='))
            elif line == 'B':
                continue
            elif line.startswith('B'):
                # sys.stderr is a file object, not a callable.
                sys.stderr.write(
                    'ERROR: bigram templates not supported: %s\n' % line)
                sys.exit(1)

    def replace(self, m):
        """
        Return the value a macro match refers to, or '' when the row
        falls outside the current sequence.
        """
        row = self.t + int(m.group('row'))
        col = int(m.group('col'))
        if 0 <= row < len(self.inst):
            return self.inst[row]['x'][col]
        return ''

    def apply(self, inst, t):
        """
        Expand every loaded template for item L{t} of the sequence
        L{inst} and append the results to that item's 'F' list.
        """
        self.inst = inst
        self.t = t
        for template in self.templates:
            f = self.macro.sub(self.replace, template)
            self.inst[t]['F'].append(f)
42
-
43
def readiter(fi, sep=None):
    """
    Yield item sequences read from an iterable of lines.
    Each non-empty line is split by L{sep}; the last value is stored in
    'y', the values before it in 'x', and 'F' starts as an empty list.
    An empty line ends a sequence; a final sequence that is not followed
    by an empty line is yielded as well.
    """
    X = []
    for line in fi:
        line = line.strip('\n')
        if not line:
            yield X
            X = []
            continue
        fields = line.split(sep)
        X.append({
            'x': fields[0:-1],
            'y': fields[-1],
            'F': []
            })
    # Do not lose the last sequence when the input lacks a trailing
    # empty line.
    if X:
        yield X
58
-
59
if __name__ == '__main__':
    import optparse

    fi = sys.stdin
    fo = sys.stdout

    # Parse the command-line arguments.
    parser = optparse.OptionParser(usage="""usage: %prog <template>
This utility reads a data set from STDIN, applies feature templates compatible
with CRF++, and outputs attributes to STDOUT. Each line of a data set must
consist of field values separated by SEPARATOR characters (customizable with
-s option)."""
        )
    parser.add_option(
        '-s', dest='separator', default='\t',
        help='specify the separator of columns of input data [default: "\\t"]'
        )
    (options, args) = parser.parse_args()
    if not args:
        # Report a usage error instead of failing with an IndexError.
        parser.error('a template file must be specified')

    # Load the templates; the file is closed as soon as it has been read.
    F = FeatureExtractor()
    with open(args[0]) as template_file:
        F.read(template_file)

    # Write the label followed by the tab-prefixed attributes of every
    # item, and an empty line after each sequence.
    for inst in readiter(fi, options.separator):
        for t, item in enumerate(inst):
            F.apply(inst, t)
            fo.write('%s' % item['y'])
            for attr in item['F']:
                fo.write('\t%s' % attr.replace(':', '__COLON__'))
            fo.write('\n')
        fo.write('\n')
@@ -1,29 +0,0 @@
1
- # $Id:$
2
-
3
- bin_PROGRAMS = crfsuite
4
- #man_MANS = crfsuite.1
5
- #EXTRA_DIST = ${man_MANS}
6
-
7
- EXTRA_DIST = \
8
- frontend.vcxproj
9
-
10
- crfsuite_SOURCES = \
11
- iwa.h \
12
- iwa.c \
13
- option.h \
14
- option.c \
15
- readdata.h \
16
- reader.c \
17
- learn.c \
18
- tag.c \
19
- dump.c \
20
- main.c
21
-
22
- #crfsuite_CPPFLAGS =
23
-
24
- AM_CFLAGS = @CFLAGS@
25
- INCLUDES = @INCLUDES@
26
- AM_LDFLAGS = @LDFLAGS@
27
-
28
- crfsuite_CFLAGS = -I$(top_builddir)/include
29
- crfsuite_LDADD = $(top_builddir)/lib/crf/libcrfsuite.la