feedbackmine-tokenizer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Manifest.txt ADDED
@@ -0,0 +1,13 @@
1
+ ext/tokenizer.c
2
+ ext/scanner.rl
3
+ ext/url.rl
4
+ ext/extconf.rb
5
+ ext/scanner.c
6
+ ext/email.rl
7
+ ext/scanner.in
8
+ Rakefile
9
+ Manifest.txt
10
+ README.txt
11
+ test
12
+ test/test.rb
13
+
data/Rakefile ADDED
@@ -0,0 +1,25 @@
1
+ require 'rubygems'
2
+ require 'hoe'
3
+
4
+ EXT = "ext/tokenizer.#{Hoe::DLEXT}" # path of the compiled extension; the suffix comes from Hoe::DLEXT
5
+ # Gem specification, declared under the name 'tokenizer' at version 0.1.0.
6
+ Hoe.new('tokenizer', '0.1.0') do |p|
7
+ p.author = 'FeedbackMine'
8
+ p.email = 'feedbackmine@feedbackmine.com'
9
+ p.url = 'http://www.tweetjobsearch.com'
10
+ p.summary = 'tokenizer'
11
+ p.description = 'tokenizer'
12
+ # Register extconf.rb as the gem's extension script and add the build outputs to the clean globs.
13
+ p.spec_extras[:extensions] = "ext/extconf.rb"
14
+ p.clean_globs << EXT << "ext/*.o" << "ext/Makefile"
15
+ end
16
+
17
+ task :test => EXT
18
+
19
+ file EXT => ["ext/extconf.rb", "ext/tokenizer.c"] do
20
+ Dir.chdir "ext" do
21
+ sh "ragel scanner.rl -o scanner.c"
22
+ ruby "extconf.rb"
23
+ sh "make"
24
+ end
25
+ end
data/ext/email.rl ADDED
@@ -0,0 +1,21 @@
1
+ #// email.rl -*-C-*-
2
+ %%{
3
+ machine Email;
4
+ #// Character classes below are written as decimal ASCII codes and ranges.
5
+ #// RFC 2822 addr-spec (local_part '@' domain) - matching email addresses
6
+ NO_WS_CTL = ( 1..8 | 11 | 12 | 14..31 | 127 );
7
+ ASCII = 1..127; #// not referenced by any other rule in this file
8
+ atext = [a-zA-Z0-9!#$%&\'*+\-/=?^_`{|}~];
9
+ qtext = ( NO_WS_CTL | 33 | 35..91 | 93..126 );
10
+ dtext = ( NO_WS_CTL | 33..90 | 94..126 );
11
+ dot_atom = atext+ ('.' atext+)*;
12
+ text = ( 1..9 | 11 | 12 | 14..127 );
13
+ quoted_pair = '\\' text;
14
+ quoted_string = '"' ( qtext | quoted_pair )* '"';
15
+ domain_literal = '[' (dtext | quoted_pair)* ']';
16
+ #// Either side of the '@' is a dot-atom or its quoted / bracketed alternative.
17
+ local_part = dot_atom | quoted_string;
18
+ domain = dot_atom | domain_literal;
19
+ #// Top-level rule: the parts are concatenated directly, so no comments or folding whitespace are accepted between them.
20
+ email = local_part '@' domain;
21
+ }%%
data/ext/extconf.rb ADDED
@@ -0,0 +1,3 @@
1
+ require 'mkmf'
2
+ $CFLAGS = "#{$CFLAGS} #{ENV['CFLAGS']} -Wall -O3" # extend the flags mkmf already set up instead of discarding them
3
+ create_makefile('tokenizer')
data/ext/scanner.c ADDED
@@ -0,0 +1,976 @@
1
+ #line 1 "scanner.rl"
2
+ /* scanner.rl -*-C-*- */
3
+ #include <ctype.h>
4
+ #include <stdio.h>
5
+ #include <stdlib.h>
6
+ #include <string.h>
7
+ #include <unistd.h>
8
+
9
+ #define RET goto ret;
10
+
11
+ #define STRIP(c) do { \
12
+ strip_char = c; \
13
+ goto ret; \
14
+ } while(0)
15
+
16
+ #line 26 "scanner.rl"
17
+
18
+
19
+
20
+ #line 21 "scanner.c"
21
+ static const char _StdTok_actions[] = {
22
+ 0, 1, 0, 1, 1, 1, 2, 1,
23
+ 3, 1, 4, 1, 5, 1, 21, 1,
24
+ 23, 1, 24, 1, 25, 1, 26, 1,
25
+ 27, 1, 28, 1, 29, 1, 30, 1,
26
+ 31, 1, 32, 1, 33, 1, 34, 1,
27
+ 35, 1, 36, 1, 37, 1, 38, 1,
28
+ 39, 1, 40, 1, 41, 1, 42, 1,
29
+ 43, 1, 44, 1, 45, 1, 46, 2,
30
+ 1, 22, 2, 5, 6, 2, 5, 7,
31
+ 2, 5, 8, 2, 5, 9, 2, 5,
32
+ 10, 2, 5, 11, 2, 5, 12, 2,
33
+ 5, 13, 2, 5, 14, 2, 5, 15,
34
+ 2, 5, 16, 2, 5, 17, 2, 5,
35
+ 18, 2, 5, 19, 2, 5, 20, 3,
36
+ 5, 1, 16
37
+ };
38
+
39
+ static const short _StdTok_key_offsets[] = {
40
+ 0, 0, 0, 14, 28, 43, 57, 69,
41
+ 75, 85, 86, 92, 107, 128, 149, 170,
42
+ 191, 218, 239, 241, 263, 285, 307, 329,
43
+ 351, 373, 395, 417, 439, 460, 487, 488,
44
+ 495, 497, 524, 525, 538, 538, 563, 577,
45
+ 591, 601, 615, 631, 647, 663, 686, 707,
46
+ 729, 732, 755, 778, 801, 824, 848, 871,
47
+ 894, 917, 939, 961, 982, 990, 1013, 1016,
48
+ 1017, 1018, 1033, 1039, 1064, 1083, 1104, 1125,
49
+ 1144, 1165, 1187, 1209, 1231, 1253, 1275, 1297,
50
+ 1319, 1341, 1366, 1382, 1401, 1421, 1446, 1473,
51
+ 1499, 1525, 1550, 1563, 1589, 1615, 1641, 1667
52
+ };
53
+
54
+ static const unsigned char _StdTok_trans_keys[] = {
55
+ 33u, 46u, 61u, 64u, 35u, 39u, 42u, 43u,
56
+ 45u, 57u, 63u, 90u, 94u, 126u, 33u, 45u,
57
+ 61u, 63u, 35u, 39u, 42u, 43u, 47u, 57u,
58
+ 65u, 90u, 94u, 126u, 33u, 45u, 61u, 63u,
59
+ 91u, 35u, 39u, 42u, 43u, 47u, 57u, 65u,
60
+ 90u, 94u, 126u, 33u, 45u, 61u, 63u, 35u,
61
+ 39u, 42u, 43u, 47u, 57u, 65u, 90u, 94u,
62
+ 126u, 92u, 93u, 1u, 8u, 11u, 12u, 14u,
63
+ 31u, 33u, 90u, 94u, 127u, 1u, 9u, 11u,
64
+ 12u, 14u, 127u, 34u, 92u, 1u, 8u, 11u,
65
+ 12u, 14u, 31u, 33u, 127u, 64u, 1u, 9u,
66
+ 11u, 12u, 14u, 127u, 33u, 45u, 47u, 61u,
67
+ 63u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
68
+ 90u, 94u, 126u, 33u, 42u, 43u, 46u, 61u,
69
+ 63u, 64u, 35u, 39u, 45u, 47u, 48u, 57u,
70
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
71
+ 33u, 45u, 46u, 47u, 61u, 63u, 64u, 35u,
72
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
73
+ 96u, 97u, 122u, 123u, 126u, 33u, 45u, 46u,
74
+ 47u, 61u, 63u, 64u, 35u, 39u, 42u, 43u,
75
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
76
+ 123u, 126u, 33u, 45u, 46u, 47u, 61u, 63u,
77
+ 64u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
78
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
79
+ 45u, 47u, 61u, 63u, 98u, 99u, 101u, 103u,
80
+ 105u, 109u, 110u, 111u, 35u, 39u, 42u, 43u,
81
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
82
+ 123u, 126u, 33u, 45u, 46u, 47u, 61u, 63u,
83
+ 64u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
84
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 48u,
85
+ 57u, 33u, 45u, 46u, 47u, 61u, 63u, 64u,
86
+ 105u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
87
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
88
+ 45u, 46u, 47u, 61u, 63u, 64u, 111u, 35u,
89
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
90
+ 96u, 97u, 122u, 123u, 126u, 33u, 45u, 46u,
91
+ 47u, 61u, 63u, 64u, 100u, 35u, 39u, 42u,
92
+ 43u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
93
+ 122u, 123u, 126u, 33u, 45u, 46u, 47u, 61u,
94
+ 63u, 64u, 111u, 35u, 39u, 42u, 43u, 48u,
95
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
96
+ 126u, 33u, 45u, 46u, 47u, 61u, 63u, 64u,
97
+ 110u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
98
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
99
+ 45u, 46u, 47u, 61u, 63u, 64u, 111u, 35u,
100
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
101
+ 96u, 97u, 122u, 123u, 126u, 33u, 45u, 46u,
102
+ 47u, 61u, 63u, 64u, 105u, 35u, 39u, 42u,
103
+ 43u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
104
+ 122u, 123u, 126u, 33u, 45u, 46u, 47u, 61u,
105
+ 63u, 64u, 101u, 35u, 39u, 42u, 43u, 48u,
106
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
107
+ 126u, 33u, 45u, 46u, 47u, 61u, 63u, 64u,
108
+ 114u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
109
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
110
+ 46u, 61u, 63u, 64u, 35u, 39u, 42u, 43u,
111
+ 45u, 47u, 48u, 57u, 65u, 90u, 94u, 96u,
112
+ 97u, 122u, 123u, 126u, 33u, 45u, 47u, 61u,
113
+ 63u, 98u, 99u, 101u, 103u, 105u, 109u, 110u,
114
+ 111u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
115
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 47u,
116
+ 95u, 44u, 58u, 64u, 90u, 97u, 122u, 40u,
117
+ 41u, 33u, 45u, 47u, 61u, 63u, 98u, 99u,
118
+ 101u, 103u, 105u, 109u, 110u, 111u, 35u, 39u,
119
+ 42u, 43u, 48u, 57u, 65u, 90u, 94u, 96u,
120
+ 97u, 122u, 123u, 126u, 47u, 45u, 47u, 58u,
121
+ 64u, 95u, 44u, 46u, 48u, 57u, 65u, 90u,
122
+ 97u, 122u, 0u, 33u, 34u, 42u, 43u, 45u,
123
+ 47u, 58u, 61u, 63u, 64u, 102u, 104u, 35u,
124
+ 39u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
125
+ 122u, 123u, 126u, 33u, 46u, 61u, 64u, 35u,
126
+ 39u, 42u, 43u, 45u, 57u, 63u, 90u, 94u,
127
+ 126u, 33u, 46u, 61u, 63u, 35u, 39u, 42u,
128
+ 43u, 45u, 57u, 65u, 90u, 94u, 126u, 34u,
129
+ 92u, 1u, 8u, 11u, 12u, 14u, 31u, 33u,
130
+ 127u, 33u, 46u, 61u, 64u, 35u, 39u, 42u,
131
+ 43u, 45u, 57u, 63u, 90u, 94u, 126u, 33u,
132
+ 46u, 61u, 64u, 35u, 39u, 42u, 43u, 45u,
133
+ 47u, 48u, 57u, 63u, 90u, 94u, 126u, 33u,
134
+ 46u, 61u, 64u, 35u, 39u, 42u, 43u, 45u,
135
+ 47u, 48u, 57u, 63u, 90u, 94u, 126u, 33u,
136
+ 46u, 61u, 64u, 35u, 39u, 42u, 43u, 45u,
137
+ 47u, 48u, 57u, 63u, 90u, 94u, 126u, 33u,
138
+ 42u, 43u, 45u, 46u, 47u, 58u, 61u, 63u,
139
+ 64u, 95u, 35u, 39u, 48u, 57u, 65u, 90u,
140
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 46u,
141
+ 61u, 63u, 64u, 35u, 39u, 42u, 43u, 45u,
142
+ 47u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
143
+ 122u, 123u, 126u, 33u, 45u, 46u, 47u, 58u,
144
+ 61u, 63u, 64u, 35u, 39u, 42u, 43u, 48u,
145
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
146
+ 126u, 47u, 48u, 57u, 33u, 45u, 46u, 47u,
147
+ 58u, 61u, 63u, 64u, 122u, 35u, 39u, 42u,
148
+ 43u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
149
+ 121u, 123u, 126u, 33u, 45u, 46u, 47u, 58u,
150
+ 61u, 63u, 64u, 109u, 35u, 39u, 42u, 43u,
151
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
152
+ 123u, 126u, 33u, 45u, 46u, 47u, 58u, 61u,
153
+ 63u, 64u, 117u, 35u, 39u, 42u, 43u, 48u,
154
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
155
+ 126u, 33u, 45u, 46u, 47u, 58u, 61u, 63u,
156
+ 64u, 118u, 35u, 39u, 42u, 43u, 48u, 57u,
157
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
158
+ 33u, 45u, 46u, 47u, 58u, 61u, 63u, 64u,
159
+ 102u, 116u, 35u, 39u, 42u, 43u, 48u, 57u,
160
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
161
+ 33u, 45u, 46u, 47u, 58u, 61u, 63u, 64u,
162
+ 108u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
163
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
164
+ 45u, 46u, 47u, 58u, 61u, 63u, 64u, 116u,
165
+ 35u, 39u, 42u, 43u, 48u, 57u, 65u, 90u,
166
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 45u,
167
+ 46u, 47u, 58u, 61u, 63u, 64u, 103u, 35u,
168
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
169
+ 96u, 97u, 122u, 123u, 126u, 33u, 45u, 46u,
170
+ 47u, 61u, 63u, 64u, 95u, 35u, 39u, 42u,
171
+ 43u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
172
+ 122u, 123u, 126u, 33u, 45u, 46u, 47u, 61u,
173
+ 63u, 64u, 95u, 35u, 39u, 42u, 43u, 48u,
174
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
175
+ 126u, 33u, 45u, 46u, 47u, 61u, 63u, 64u,
176
+ 35u, 39u, 42u, 43u, 48u, 57u, 65u, 90u,
177
+ 94u, 96u, 97u, 122u, 123u, 126u, 47u, 95u,
178
+ 44u, 58u, 64u, 90u, 97u, 122u, 33u, 42u,
179
+ 43u, 45u, 46u, 47u, 58u, 61u, 63u, 64u,
180
+ 95u, 35u, 39u, 48u, 57u, 65u, 90u, 94u,
181
+ 96u, 97u, 122u, 123u, 126u, 40u, 41u, 45u,
182
+ 40u, 41u, 33u, 46u, 61u, 63u, 64u, 35u,
183
+ 39u, 42u, 43u, 45u, 57u, 65u, 90u, 94u,
184
+ 126u, 48u, 57u, 65u, 90u, 97u, 122u, 33u,
185
+ 38u, 39u, 42u, 43u, 45u, 46u, 47u, 58u,
186
+ 61u, 63u, 64u, 95u, 35u, 37u, 48u, 57u,
187
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
188
+ 33u, 46u, 61u, 63u, 64u, 35u, 39u, 42u,
189
+ 43u, 45u, 57u, 65u, 90u, 94u, 96u, 97u,
190
+ 122u, 123u, 126u, 33u, 46u, 61u, 63u, 64u,
191
+ 35u, 39u, 42u, 43u, 45u, 47u, 48u, 57u,
192
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
193
+ 33u, 46u, 61u, 63u, 64u, 83u, 115u, 35u,
194
+ 39u, 42u, 43u, 45u, 57u, 65u, 90u, 94u,
195
+ 96u, 97u, 122u, 123u, 126u, 33u, 46u, 61u,
196
+ 63u, 64u, 35u, 39u, 42u, 43u, 45u, 57u,
197
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
198
+ 33u, 45u, 46u, 47u, 61u, 63u, 64u, 35u,
199
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
200
+ 96u, 97u, 122u, 123u, 126u, 33u, 45u, 46u,
201
+ 47u, 61u, 63u, 64u, 105u, 35u, 39u, 42u,
202
+ 43u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
203
+ 122u, 123u, 126u, 33u, 45u, 46u, 47u, 61u,
204
+ 63u, 64u, 111u, 35u, 39u, 42u, 43u, 48u,
205
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
206
+ 126u, 33u, 45u, 46u, 47u, 61u, 63u, 64u,
207
+ 100u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
208
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
209
+ 45u, 46u, 47u, 61u, 63u, 64u, 111u, 35u,
210
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
211
+ 96u, 97u, 122u, 123u, 126u, 33u, 45u, 46u,
212
+ 47u, 61u, 63u, 64u, 110u, 35u, 39u, 42u,
213
+ 43u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
214
+ 122u, 123u, 126u, 33u, 45u, 46u, 47u, 61u,
215
+ 63u, 64u, 105u, 35u, 39u, 42u, 43u, 48u,
216
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
217
+ 126u, 33u, 45u, 46u, 47u, 61u, 63u, 64u,
218
+ 101u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
219
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
220
+ 45u, 46u, 47u, 61u, 63u, 64u, 114u, 35u,
221
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
222
+ 96u, 97u, 122u, 123u, 126u, 33u, 38u, 39u,
223
+ 42u, 43u, 45u, 46u, 47u, 58u, 61u, 63u,
224
+ 64u, 95u, 35u, 37u, 48u, 57u, 65u, 90u,
225
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 46u,
226
+ 61u, 64u, 83u, 115u, 35u, 39u, 42u, 43u,
227
+ 45u, 57u, 63u, 90u, 94u, 126u, 33u, 45u,
228
+ 61u, 63u, 91u, 35u, 39u, 42u, 43u, 47u,
229
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
230
+ 126u, 33u, 46u, 61u, 63u, 35u, 39u, 42u,
231
+ 43u, 45u, 47u, 48u, 57u, 65u, 90u, 94u,
232
+ 96u, 97u, 122u, 123u, 126u, 33u, 38u, 39u,
233
+ 42u, 43u, 45u, 46u, 47u, 58u, 61u, 63u,
234
+ 64u, 95u, 35u, 37u, 48u, 57u, 65u, 90u,
235
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 38u,
236
+ 39u, 42u, 43u, 45u, 46u, 47u, 58u, 61u,
237
+ 63u, 64u, 95u, 105u, 116u, 35u, 37u, 48u,
238
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
239
+ 126u, 33u, 38u, 39u, 42u, 43u, 45u, 46u,
240
+ 47u, 58u, 61u, 63u, 64u, 95u, 108u, 35u,
241
+ 37u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
242
+ 122u, 123u, 126u, 33u, 38u, 39u, 42u, 43u,
243
+ 45u, 46u, 47u, 58u, 61u, 63u, 64u, 95u,
244
+ 101u, 35u, 37u, 48u, 57u, 65u, 90u, 94u,
245
+ 96u, 97u, 122u, 123u, 126u, 33u, 38u, 39u,
246
+ 42u, 43u, 45u, 46u, 47u, 58u, 61u, 63u,
247
+ 64u, 95u, 35u, 37u, 48u, 57u, 65u, 90u,
248
+ 94u, 96u, 97u, 122u, 123u, 126u, 45u, 47u,
249
+ 58u, 64u, 95u, 44u, 46u, 48u, 57u, 65u,
250
+ 90u, 97u, 122u, 33u, 38u, 39u, 42u, 43u,
251
+ 45u, 46u, 47u, 58u, 61u, 63u, 64u, 95u,
252
+ 112u, 35u, 37u, 48u, 57u, 65u, 90u, 94u,
253
+ 96u, 97u, 122u, 123u, 126u, 33u, 38u, 39u,
254
+ 42u, 43u, 45u, 46u, 47u, 58u, 61u, 63u,
255
+ 64u, 95u, 116u, 35u, 37u, 48u, 57u, 65u,
256
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
257
+ 38u, 39u, 42u, 43u, 45u, 46u, 47u, 58u,
258
+ 61u, 63u, 64u, 95u, 116u, 35u, 37u, 48u,
259
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
260
+ 126u, 33u, 38u, 39u, 42u, 43u, 45u, 46u,
261
+ 47u, 58u, 61u, 63u, 64u, 95u, 112u, 35u,
262
+ 37u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
263
+ 122u, 123u, 126u, 33u, 38u, 39u, 42u, 43u,
264
+ 45u, 46u, 47u, 58u, 61u, 63u, 64u, 95u,
265
+ 115u, 35u, 37u, 48u, 57u, 65u, 90u, 94u,
266
+ 96u, 97u, 122u, 123u, 126u, 0
267
+ };
268
+
269
+ static const char _StdTok_single_lengths[] = {
270
+ 0, 0, 4, 4, 5, 4, 2, 0,
271
+ 2, 1, 0, 5, 7, 7, 7, 7,
272
+ 13, 7, 0, 8, 8, 8, 8, 8,
273
+ 8, 8, 8, 8, 5, 13, 1, 1,
274
+ 2, 13, 1, 5, 0, 13, 4, 4,
275
+ 2, 4, 4, 4, 4, 11, 5, 8,
276
+ 1, 9, 9, 9, 9, 10, 9, 9,
277
+ 9, 8, 8, 7, 2, 11, 3, 1,
278
+ 1, 5, 0, 13, 5, 5, 7, 5,
279
+ 7, 8, 8, 8, 8, 8, 8, 8,
280
+ 8, 13, 6, 5, 4, 13, 15, 14,
281
+ 14, 13, 5, 14, 14, 14, 14, 14
282
+ };
283
+
284
+ static const char _StdTok_range_lengths[] = {
285
+ 0, 0, 5, 5, 5, 5, 5, 3,
286
+ 4, 0, 3, 5, 7, 7, 7, 7,
287
+ 7, 7, 1, 7, 7, 7, 7, 7,
288
+ 7, 7, 7, 7, 8, 7, 0, 3,
289
+ 0, 7, 0, 4, 0, 6, 5, 5,
290
+ 4, 5, 6, 6, 6, 6, 8, 7,
291
+ 1, 7, 7, 7, 7, 7, 7, 7,
292
+ 7, 7, 7, 7, 3, 6, 0, 0,
293
+ 0, 5, 3, 6, 7, 8, 7, 7,
294
+ 7, 7, 7, 7, 7, 7, 7, 7,
295
+ 7, 6, 5, 7, 8, 6, 6, 6,
296
+ 6, 6, 4, 6, 6, 6, 6, 6
297
+ };
298
+
299
+ static const short _StdTok_index_offsets[] = {
300
+ 0, 0, 1, 11, 21, 32, 42, 50,
301
+ 54, 61, 63, 67, 78, 93, 108, 123,
302
+ 138, 159, 174, 176, 192, 208, 224, 240,
303
+ 256, 272, 288, 304, 320, 334, 355, 357,
304
+ 362, 365, 386, 388, 398, 399, 419, 429,
305
+ 439, 446, 456, 467, 478, 489, 507, 521,
306
+ 537, 540, 557, 574, 591, 608, 626, 643,
307
+ 660, 677, 693, 709, 724, 730, 748, 752,
308
+ 754, 756, 767, 771, 791, 804, 818, 833,
309
+ 846, 861, 877, 893, 909, 925, 941, 957,
310
+ 973, 989, 1009, 1021, 1034, 1047, 1067, 1089,
311
+ 1110, 1131, 1151, 1161, 1182, 1203, 1224, 1245
312
+ };
313
+
314
+ static const char _StdTok_indicies[] = {
315
+ 0, 2, 3, 2, 4, 2, 2, 2,
316
+ 2, 2, 1, 2, 2, 2, 2, 2,
317
+ 2, 2, 2, 2, 1, 5, 5, 5,
318
+ 5, 6, 5, 5, 5, 5, 5, 1,
319
+ 5, 5, 5, 5, 5, 5, 5, 5,
320
+ 5, 7, 8, 9, 6, 6, 6, 6,
321
+ 6, 1, 6, 6, 6, 1, 12, 13,
322
+ 11, 11, 11, 11, 10, 4, 10, 11,
323
+ 11, 11, 10, 2, 2, 2, 2, 2,
324
+ 2, 2, 15, 2, 2, 14, 2, 16,
325
+ 2, 3, 2, 2, 4, 2, 2, 17,
326
+ 17, 2, 17, 2, 1, 2, 18, 3,
327
+ 2, 2, 2, 4, 2, 2, 19, 19,
328
+ 2, 19, 2, 1, 2, 18, 3, 2,
329
+ 2, 2, 4, 2, 2, 20, 20, 2,
330
+ 20, 2, 1, 2, 18, 21, 2, 2,
331
+ 2, 4, 2, 2, 20, 20, 2, 20,
332
+ 2, 1, 2, 2, 2, 2, 2, 23,
333
+ 24, 25, 26, 27, 28, 29, 30, 2,
334
+ 2, 20, 22, 2, 22, 2, 1, 2,
335
+ 18, 21, 2, 2, 2, 4, 2, 2,
336
+ 20, 31, 2, 31, 2, 1, 33, 32,
337
+ 2, 18, 21, 2, 2, 2, 4, 34,
338
+ 2, 2, 20, 31, 2, 31, 2, 1,
339
+ 2, 18, 21, 2, 2, 2, 4, 35,
340
+ 2, 2, 20, 31, 2, 31, 2, 1,
341
+ 2, 18, 21, 2, 2, 2, 4, 36,
342
+ 2, 2, 20, 31, 2, 31, 2, 1,
343
+ 2, 18, 21, 2, 2, 2, 4, 37,
344
+ 2, 2, 20, 31, 2, 31, 2, 1,
345
+ 2, 18, 21, 2, 2, 2, 4, 38,
346
+ 2, 2, 20, 31, 2, 31, 2, 1,
347
+ 2, 18, 21, 2, 2, 2, 4, 31,
348
+ 2, 2, 20, 20, 2, 20, 2, 32,
349
+ 2, 18, 21, 2, 2, 2, 4, 39,
350
+ 2, 2, 20, 31, 2, 31, 2, 1,
351
+ 2, 18, 21, 2, 2, 2, 4, 40,
352
+ 2, 2, 20, 31, 2, 31, 2, 1,
353
+ 2, 18, 21, 2, 2, 2, 4, 41,
354
+ 2, 2, 20, 31, 2, 31, 2, 1,
355
+ 2, 3, 2, 2, 4, 2, 2, 2,
356
+ 42, 42, 2, 42, 2, 1, 2, 2,
357
+ 2, 2, 2, 23, 24, 25, 26, 27,
358
+ 28, 29, 30, 2, 2, 44, 22, 2,
359
+ 22, 2, 43, 45, 1, 46, 46, 46,
360
+ 46, 1, 47, 48, 10, 2, 2, 2,
361
+ 2, 2, 50, 51, 52, 53, 54, 55,
362
+ 56, 57, 2, 2, 20, 49, 2, 49,
363
+ 2, 1, 59, 58, 60, 61, 46, 46,
364
+ 60, 46, 60, 60, 60, 58, 62, 63,
365
+ 65, 66, 67, 68, 68, 67, 70, 67,
366
+ 71, 72, 74, 75, 67, 69, 73, 67,
367
+ 73, 67, 64, 65, 3, 2, 4, 2,
368
+ 2, 2, 2, 2, 76, 5, 78, 5,
369
+ 5, 5, 5, 5, 5, 5, 77, 12,
370
+ 13, 11, 11, 11, 11, 79, 2, 3,
371
+ 2, 4, 2, 2, 2, 2, 2, 1,
372
+ 2, 3, 2, 4, 2, 2, 2, 80,
373
+ 2, 2, 79, 2, 82, 2, 4, 2,
374
+ 2, 2, 80, 2, 2, 81, 2, 3,
375
+ 2, 4, 2, 2, 2, 15, 2, 2,
376
+ 83, 2, 16, 2, 85, 86, 2, 87,
377
+ 2, 2, 4, 89, 2, 69, 88, 2,
378
+ 88, 2, 84, 2, 3, 2, 2, 4,
379
+ 2, 2, 2, 17, 17, 2, 17, 2,
380
+ 90, 2, 18, 21, 92, 93, 2, 2,
381
+ 4, 2, 2, 20, 20, 2, 20, 2,
382
+ 91, 94, 33, 91, 2, 18, 21, 92,
383
+ 93, 2, 2, 4, 31, 2, 2, 20,
384
+ 20, 2, 20, 2, 91, 2, 18, 21,
385
+ 92, 93, 2, 2, 4, 31, 2, 2,
386
+ 20, 20, 2, 20, 2, 91, 2, 18,
387
+ 21, 92, 93, 2, 2, 4, 31, 2,
388
+ 2, 20, 20, 2, 20, 2, 91, 2,
389
+ 18, 21, 92, 93, 2, 2, 4, 31,
390
+ 2, 2, 20, 20, 2, 20, 2, 91,
391
+ 2, 18, 21, 92, 93, 2, 2, 4,
392
+ 95, 31, 2, 2, 20, 20, 2, 20,
393
+ 2, 91, 2, 18, 21, 92, 93, 2,
394
+ 2, 4, 31, 2, 2, 20, 20, 2,
395
+ 20, 2, 91, 2, 18, 21, 92, 93,
396
+ 2, 2, 4, 31, 2, 2, 20, 20,
397
+ 2, 20, 2, 91, 2, 18, 21, 92,
398
+ 93, 2, 2, 4, 31, 2, 2, 20,
399
+ 20, 2, 20, 2, 91, 2, 85, 21,
400
+ 2, 2, 2, 4, 89, 2, 2, 19,
401
+ 19, 2, 19, 2, 84, 2, 89, 3,
402
+ 2, 2, 2, 4, 89, 2, 2, 42,
403
+ 42, 2, 42, 2, 84, 2, 18, 21,
404
+ 2, 2, 2, 4, 2, 2, 44, 20,
405
+ 2, 20, 2, 83, 96, 46, 46, 46,
406
+ 46, 91, 2, 16, 2, 85, 21, 2,
407
+ 87, 2, 2, 4, 89, 2, 88, 88,
408
+ 2, 88, 2, 84, 47, 48, 97, 79,
409
+ 47, 98, 48, 99, 2, 3, 2, 71,
410
+ 4, 2, 2, 2, 2, 2, 100, 101,
411
+ 101, 101, 1, 2, 103, 104, 16, 2,
412
+ 85, 105, 2, 87, 2, 2, 107, 89,
413
+ 2, 106, 108, 2, 108, 2, 102, 2,
414
+ 3, 2, 2, 4, 2, 2, 2, 110,
415
+ 2, 110, 2, 109, 2, 3, 2, 2,
416
+ 4, 2, 2, 2, 110, 110, 2, 110,
417
+ 2, 109, 2, 3, 2, 2, 4, 113,
418
+ 113, 2, 2, 2, 112, 2, 112, 2,
419
+ 111, 2, 3, 2, 2, 4, 2, 2,
420
+ 2, 112, 2, 112, 2, 1, 2, 18,
421
+ 105, 2, 2, 2, 4, 2, 2, 20,
422
+ 31, 2, 31, 2, 114, 2, 18, 105,
423
+ 2, 2, 2, 4, 34, 2, 2, 20,
424
+ 31, 2, 31, 2, 114, 2, 18, 105,
425
+ 2, 2, 2, 4, 35, 2, 2, 20,
426
+ 31, 2, 31, 2, 114, 2, 18, 105,
427
+ 2, 2, 2, 4, 36, 2, 2, 20,
428
+ 31, 2, 31, 2, 114, 2, 18, 105,
429
+ 2, 2, 2, 4, 37, 2, 2, 20,
430
+ 31, 2, 31, 2, 114, 2, 18, 105,
431
+ 2, 2, 2, 4, 38, 2, 2, 20,
432
+ 31, 2, 31, 2, 114, 2, 18, 105,
433
+ 2, 2, 2, 4, 39, 2, 2, 20,
434
+ 31, 2, 31, 2, 114, 2, 18, 105,
435
+ 2, 2, 2, 4, 40, 2, 2, 20,
436
+ 31, 2, 31, 2, 114, 2, 18, 105,
437
+ 2, 2, 2, 4, 41, 2, 2, 20,
438
+ 31, 2, 31, 2, 114, 2, 103, 115,
439
+ 16, 2, 85, 21, 2, 87, 2, 2,
440
+ 107, 89, 2, 106, 106, 2, 106, 2,
441
+ 102, 2, 3, 2, 4, 116, 116, 2,
442
+ 2, 2, 2, 2, 111, 5, 5, 5,
443
+ 5, 6, 5, 5, 5, 117, 5, 117,
444
+ 5, 109, 5, 78, 5, 5, 5, 5,
445
+ 5, 117, 117, 5, 117, 5, 77, 2,
446
+ 103, 104, 16, 2, 85, 21, 2, 87,
447
+ 2, 2, 107, 89, 2, 106, 108, 2,
448
+ 108, 2, 102, 2, 103, 104, 16, 2,
449
+ 85, 105, 2, 87, 2, 2, 107, 89,
450
+ 118, 119, 2, 106, 108, 2, 108, 2,
451
+ 102, 2, 103, 104, 16, 2, 85, 21,
452
+ 2, 87, 2, 2, 107, 89, 120, 2,
453
+ 106, 108, 2, 108, 2, 102, 2, 103,
454
+ 104, 16, 2, 85, 21, 2, 87, 2,
455
+ 2, 107, 89, 121, 2, 106, 108, 2,
456
+ 108, 2, 102, 2, 103, 104, 16, 2,
457
+ 85, 21, 2, 122, 2, 2, 107, 89,
458
+ 2, 106, 108, 2, 108, 2, 102, 60,
459
+ 123, 46, 46, 60, 46, 60, 60, 60,
460
+ 91, 2, 103, 104, 16, 2, 85, 21,
461
+ 2, 87, 2, 2, 107, 89, 121, 2,
462
+ 106, 108, 2, 108, 2, 102, 2, 103,
463
+ 104, 16, 2, 85, 105, 2, 87, 2,
464
+ 2, 107, 89, 124, 2, 106, 108, 2,
465
+ 108, 2, 102, 2, 103, 104, 16, 2,
466
+ 85, 21, 2, 87, 2, 2, 107, 89,
467
+ 125, 2, 106, 108, 2, 108, 2, 102,
468
+ 2, 103, 104, 16, 2, 85, 21, 2,
469
+ 87, 2, 2, 107, 89, 126, 2, 106,
470
+ 108, 2, 108, 2, 102, 2, 103, 104,
471
+ 16, 2, 85, 21, 2, 122, 2, 2,
472
+ 107, 89, 121, 2, 106, 108, 2, 108,
473
+ 2, 102, 0
474
+ };
475
+
476
+ static const char _StdTok_trans_targs[] = {
477
+ 36, 37, 2, 3, 4, 39, 6, 37,
478
+ 7, 37, 37, 8, 9, 10, 37, 44,
479
+ 12, 46, 14, 57, 15, 16, 17, 19,
480
+ 20, 21, 22, 23, 25, 26, 27, 47,
481
+ 37, 48, 49, 50, 51, 52, 53, 54,
482
+ 55, 56, 58, 37, 59, 31, 60, 63,
483
+ 64, 72, 73, 74, 75, 76, 77, 78,
484
+ 79, 80, 37, 35, 60, 90, 0, 37,
485
+ 37, 38, 40, 41, 42, 45, 62, 65,
486
+ 66, 67, 86, 92, 37, 37, 5, 37,
487
+ 43, 37, 11, 37, 37, 13, 29, 30,
488
+ 61, 28, 37, 37, 41, 18, 37, 24,
489
+ 60, 32, 37, 37, 37, 66, 37, 68,
490
+ 70, 33, 81, 83, 85, 37, 69, 37,
491
+ 71, 71, 37, 82, 41, 84, 87, 91,
492
+ 88, 89, 34, 90, 93, 94, 95
493
+ };
494
+
495
+ static const char _StdTok_trans_actions[] = {
496
+ 5, 61, 0, 0, 0, 11, 0, 49,
497
+ 0, 13, 59, 0, 0, 0, 57, 105,
498
+ 0, 75, 0, 90, 0, 0, 0, 0,
499
+ 0, 0, 0, 0, 0, 0, 0, 96,
500
+ 55, 0, 96, 96, 96, 96, 96, 96,
501
+ 96, 96, 90, 53, 105, 0, 0, 0,
502
+ 0, 99, 99, 99, 99, 99, 99, 99,
503
+ 99, 99, 51, 0, 1, 0, 0, 15,
504
+ 17, 69, 108, 108, 108, 90, 11, 66,
505
+ 108, 78, 78, 78, 21, 29, 0, 47,
506
+ 102, 43, 0, 45, 35, 0, 0, 0,
507
+ 90, 0, 27, 39, 111, 0, 63, 0,
508
+ 3, 0, 25, 23, 19, 72, 31, 93,
509
+ 81, 0, 78, 93, 78, 37, 93, 33,
510
+ 87, 84, 41, 81, 84, 11, 78, 78,
511
+ 78, 78, 0, 3, 78, 78, 78
512
+ };
513
+
514
+ static const char _StdTok_to_state_actions[] = {
515
+ 0, 7, 0, 0, 0, 0, 0, 0,
516
+ 0, 0, 0, 0, 0, 0, 0, 0,
517
+ 0, 0, 0, 0, 0, 0, 0, 0,
518
+ 0, 0, 0, 0, 0, 0, 0, 0,
519
+ 0, 0, 0, 0, 7, 7, 0, 0,
520
+ 0, 0, 0, 0, 0, 0, 0, 0,
521
+ 0, 0, 0, 0, 0, 0, 0, 0,
522
+ 0, 0, 0, 0, 0, 0, 0, 0,
523
+ 0, 0, 0, 0, 0, 0, 0, 0,
524
+ 0, 0, 0, 0, 0, 0, 0, 0,
525
+ 0, 0, 0, 0, 0, 0, 0, 0,
526
+ 0, 0, 0, 0, 0, 0, 0, 0
527
+ };
528
+
529
+ static const char _StdTok_from_state_actions[] = {
530
+ 0, 0, 0, 0, 0, 0, 0, 0,
531
+ 0, 0, 0, 0, 0, 0, 0, 0,
532
+ 0, 0, 0, 0, 0, 0, 0, 0,
533
+ 0, 0, 0, 0, 0, 0, 0, 0,
534
+ 0, 0, 0, 0, 0, 9, 0, 0,
535
+ 0, 0, 0, 0, 0, 0, 0, 0,
536
+ 0, 0, 0, 0, 0, 0, 0, 0,
537
+ 0, 0, 0, 0, 0, 0, 0, 0,
538
+ 0, 0, 0, 0, 0, 0, 0, 0,
539
+ 0, 0, 0, 0, 0, 0, 0, 0,
540
+ 0, 0, 0, 0, 0, 0, 0, 0,
541
+ 0, 0, 0, 0, 0, 0, 0, 0
542
+ };
543
+
544
+ static const short _StdTok_eof_trans[] = {
545
+ 0, 0, 2, 2, 2, 8, 2, 2,
546
+ 11, 11, 11, 15, 2, 2, 2, 2,
547
+ 2, 2, 33, 2, 2, 2, 2, 2,
548
+ 33, 2, 2, 2, 2, 44, 2, 2,
549
+ 11, 2, 59, 59, 0, 0, 77, 78,
550
+ 80, 2, 80, 82, 84, 85, 91, 92,
551
+ 92, 92, 92, 92, 92, 92, 92, 92,
552
+ 92, 85, 85, 84, 92, 85, 80, 99,
553
+ 100, 101, 2, 103, 110, 110, 112, 2,
554
+ 115, 115, 115, 115, 115, 115, 115, 115,
555
+ 115, 103, 112, 110, 78, 103, 103, 103,
556
+ 103, 103, 92, 103, 103, 103, 103, 103
557
+ };
558
+
559
+ static const int StdTok_start = 1;
560
+ static const int StdTok_error = 0;
561
+
562
+ static const int StdTok_en_frt_tokenizer = 37;
563
+ static const int StdTok_en_main = 1;
564
+
565
+ #line 29 "scanner.rl"
566
+
567
+ void frt_std_scan(const char *in,
568
+ char *out, size_t out_size,
569
+ const char **start,
570
+ const char **end,
571
+ int *token_size)
572
+ {
573
+ int cs, act, top;
574
+ int stack[32];
575
+ char *ts = 0, *te = 0;
576
+
577
+
578
+ #line 579 "scanner.c"
579
+ {
580
+ cs = StdTok_start;
581
+ top = 0;
582
+ ts = 0;
583
+ te = 0;
584
+ act = 0;
585
+ }
586
+ #line 41 "scanner.rl"
587
+
588
+ char *p = (char *)in, *pe = 0, *eof = pe;
589
+ int skip = 0;
590
+ int trunc = 0;
591
+ char strip_char = 0;
592
+
593
+ *end = 0;
594
+ *start = 0;
595
+ *token_size = 0;
596
+
597
+
598
+ #line 599 "scanner.c"
599
+ {
600
+ int _klen;
601
+ unsigned int _trans;
602
+ const char *_acts;
603
+ unsigned int _nacts;
604
+ const unsigned char *_keys;
605
+
606
+ if ( p == pe )
607
+ goto _test_eof;
608
+ if ( cs == 0 )
609
+ goto _out;
610
+ _resume:
611
+ _acts = _StdTok_actions + _StdTok_from_state_actions[cs];
612
+ _nacts = (unsigned int) *_acts++;
613
+ while ( _nacts-- > 0 ) {
614
+ switch ( *_acts++ ) {
615
+ case 4:
616
+ #line 1 "scanner.rl"
617
+ {ts = p;}
618
+ break;
619
+ #line 620 "scanner.c"
620
+ }
621
+ }
622
+
623
+ _keys = _StdTok_trans_keys + _StdTok_key_offsets[cs];
624
+ _trans = _StdTok_index_offsets[cs];
625
+
626
+ _klen = _StdTok_single_lengths[cs];
627
+ if ( _klen > 0 ) {
628
+ const unsigned char *_lower = _keys;
629
+ const unsigned char *_mid;
630
+ const unsigned char *_upper = _keys + _klen - 1;
631
+ while (1) {
632
+ if ( _upper < _lower )
633
+ break;
634
+
635
+ _mid = _lower + ((_upper-_lower) >> 1);
636
+ if ( (*p) < *_mid )
637
+ _upper = _mid - 1;
638
+ else if ( (*p) > *_mid )
639
+ _lower = _mid + 1;
640
+ else {
641
+ _trans += (_mid - _keys);
642
+ goto _match;
643
+ }
644
+ }
645
+ _keys += _klen;
646
+ _trans += _klen;
647
+ }
648
+
649
+ _klen = _StdTok_range_lengths[cs];
650
+ if ( _klen > 0 ) {
651
+ const unsigned char *_lower = _keys;
652
+ const unsigned char *_mid;
653
+ const unsigned char *_upper = _keys + (_klen<<1) - 2;
654
+ while (1) {
655
+ if ( _upper < _lower )
656
+ break;
657
+
658
+ _mid = _lower + (((_upper-_lower) >> 1) & ~1);
659
+ if ( (*p) < _mid[0] )
660
+ _upper = _mid - 2;
661
+ else if ( (*p) > _mid[1] )
662
+ _lower = _mid + 2;
663
+ else {
664
+ _trans += ((_mid - _keys)>>1);
665
+ goto _match;
666
+ }
667
+ }
668
+ _trans += _klen;
669
+ }
670
+
671
+ _match:
672
+ _trans = _StdTok_indicies[_trans];
673
+ _eof_trans:
674
+ cs = _StdTok_trans_targs[_trans];
675
+
676
+ if ( _StdTok_trans_actions[_trans] == 0 )
677
+ goto _again;
678
+
679
+ _acts = _StdTok_actions + _StdTok_trans_actions[_trans];
680
+ _nacts = (unsigned int) *_acts++;
681
+ while ( _nacts-- > 0 )
682
+ {
683
+ switch ( *_acts++ )
684
+ {
685
+ case 0:
686
+ #line 14 "scanner.rl"
687
+ { skip = p - ts; }
688
+ break;
689
+ case 1:
690
+ #line 26 "scanner.rl"
691
+ { trunc = 1; }
692
+ break;
693
+ case 2:
694
+ #line 25 "scanner.rl"
695
+ { p--; {stack[top++] = cs; cs = 37; goto _again;} }
696
+ break;
697
+ case 5:
698
+ #line 1 "scanner.rl"
699
+ {te = p+1;}
700
+ break;
701
+ case 6:
702
+ #line 12 "scanner.rl"
703
+ {act = 1;}
704
+ break;
705
+ case 7:
706
+ #line 14 "scanner.rl"
707
+ {act = 2;}
708
+ break;
709
+ case 8:
710
+ #line 16 "scanner.rl"
711
+ {act = 3;}
712
+ break;
713
+ case 9:
714
+ #line 22 "scanner.rl"
715
+ {act = 6;}
716
+ break;
717
+ case 10:
718
+ #line 28 "scanner.rl"
719
+ {act = 8;}
720
+ break;
721
+ case 11:
722
+ #line 29 "scanner.rl"
723
+ {act = 9;}
724
+ break;
725
+ case 12:
726
+ #line 30 "scanner.rl"
727
+ {act = 10;}
728
+ break;
729
+ case 13:
730
+ #line 33 "scanner.rl"
731
+ {act = 11;}
732
+ break;
733
+ case 14:
734
+ #line 36 "scanner.rl"
735
+ {act = 12;}
736
+ break;
737
+ case 15:
738
+ #line 39 "scanner.rl"
739
+ {act = 13;}
740
+ break;
741
+ case 16:
742
+ #line 42 "scanner.rl"
743
+ {act = 14;}
744
+ break;
745
+ case 17:
746
+ #line 45 "scanner.rl"
747
+ {act = 15;}
748
+ break;
749
+ case 18:
750
+ #line 48 "scanner.rl"
751
+ {act = 16;}
752
+ break;
753
+ case 19:
754
+ #line 49 "scanner.rl"
755
+ {act = 17;}
756
+ break;
757
+ case 20:
758
+ #line 53 "scanner.rl"
759
+ {act = 19;}
760
+ break;
761
+ case 21:
762
+ #line 25 "scanner.rl"
763
+ {te = p+1;{ RET; }}
764
+ break;
765
+ case 22:
766
+ #line 42 "scanner.rl"
767
+ {te = p+1;{ RET; }}
768
+ break;
769
+ case 23:
770
+ #line 52 "scanner.rl"
771
+ {te = p+1;{ return; }}
772
+ break;
773
+ case 24:
774
+ #line 53 "scanner.rl"
775
+ {te = p+1;}
776
+ break;
777
+ case 25:
778
+ #line 12 "scanner.rl"
779
+ {te = p;p--;{ RET; }}
780
+ break;
781
+ case 26:
782
+ #line 14 "scanner.rl"
783
+ {te = p;p--;{ RET; }}
784
+ break;
785
+ case 27:
786
+ #line 18 "scanner.rl"
787
+ {te = p;p--;{ RET; }}
788
+ break;
789
+ case 28:
790
+ #line 20 "scanner.rl"
791
+ {te = p;p--;{ RET; }}
792
+ break;
793
+ case 29:
794
+ #line 22 "scanner.rl"
795
+ {te = p;p--;{ RET; }}
796
+ break;
797
+ case 30:
798
+ #line 25 "scanner.rl"
799
+ {te = p;p--;{ RET; }}
800
+ break;
801
+ case 31:
802
+ #line 28 "scanner.rl"
803
+ {te = p;p--;{ RET; }}
804
+ break;
805
+ case 32:
806
+ #line 29 "scanner.rl"
807
+ {te = p;p--;{ trunc = 1; RET; }}
808
+ break;
809
+ case 33:
810
+ #line 36 "scanner.rl"
811
+ {te = p;p--;{ RET; }}
812
+ break;
813
+ case 34:
814
+ #line 39 "scanner.rl"
815
+ {te = p;p--;{ RET; }}
816
+ break;
817
+ case 35:
818
+ #line 42 "scanner.rl"
819
+ {te = p;p--;{ RET; }}
820
+ break;
821
+ case 36:
822
+ #line 45 "scanner.rl"
823
+ {te = p;p--;{ STRIP('.'); }}
824
+ break;
825
+ case 37:
826
+ #line 48 "scanner.rl"
827
+ {te = p;p--;{ RET; }}
828
+ break;
829
+ case 38:
830
+ #line 49 "scanner.rl"
831
+ {te = p;p--;{ RET; }}
832
+ break;
833
+ case 39:
834
+ #line 53 "scanner.rl"
835
+ {te = p;p--;}
836
+ break;
837
+ case 40:
838
+ #line 25 "scanner.rl"
839
+ {{p = ((te))-1;}{ RET; }}
840
+ break;
841
+ case 41:
842
+ #line 28 "scanner.rl"
843
+ {{p = ((te))-1;}{ RET; }}
844
+ break;
845
+ case 42:
846
+ #line 36 "scanner.rl"
847
+ {{p = ((te))-1;}{ RET; }}
848
+ break;
849
+ case 43:
850
+ #line 42 "scanner.rl"
851
+ {{p = ((te))-1;}{ RET; }}
852
+ break;
853
+ case 44:
854
+ #line 48 "scanner.rl"
855
+ {{p = ((te))-1;}{ RET; }}
856
+ break;
857
+ case 45:
858
+ #line 53 "scanner.rl"
859
+ {{p = ((te))-1;}}
860
+ break;
861
+ case 46:
862
+ #line 1 "scanner.rl"
863
+ { switch( act ) {
864
+ case 1:
865
+ {{p = ((te))-1;} RET; }
866
+ break;
867
+ case 2:
868
+ {{p = ((te))-1;} RET; }
869
+ break;
870
+ case 3:
871
+ {{p = ((te))-1;} RET; }
872
+ break;
873
+ case 6:
874
+ {{p = ((te))-1;} RET; }
875
+ break;
876
+ case 8:
877
+ {{p = ((te))-1;} RET; }
878
+ break;
879
+ case 9:
880
+ {{p = ((te))-1;} trunc = 1; RET; }
881
+ break;
882
+ case 10:
883
+ {{p = ((te))-1;} trunc = 2; RET; }
884
+ break;
885
+ case 11:
886
+ {{p = ((te))-1;} RET; }
887
+ break;
888
+ case 12:
889
+ {{p = ((te))-1;} RET; }
890
+ break;
891
+ case 13:
892
+ {{p = ((te))-1;} RET; }
893
+ break;
894
+ case 14:
895
+ {{p = ((te))-1;} RET; }
896
+ break;
897
+ case 15:
898
+ {{p = ((te))-1;} STRIP('.'); }
899
+ break;
900
+ case 16:
901
+ {{p = ((te))-1;} RET; }
902
+ break;
903
+ case 17:
904
+ {{p = ((te))-1;} RET; }
905
+ break;
906
+ case 19:
907
+ {{p = ((te))-1;}}
908
+ break;
909
+ }
910
+ }
911
+ break;
912
+ #line 913 "scanner.c"
913
+ }
914
+ }
915
+
916
+ _again:
917
+ _acts = _StdTok_actions + _StdTok_to_state_actions[cs];
918
+ _nacts = (unsigned int) *_acts++;
919
+ while ( _nacts-- > 0 ) {
920
+ switch ( *_acts++ ) {
921
+ case 3:
922
+ #line 1 "scanner.rl"
923
+ {ts = 0;}
924
+ break;
925
+ #line 926 "scanner.c"
926
+ }
927
+ }
928
+
929
+ if ( cs == 0 )
930
+ goto _out;
931
+ if ( ++p != pe )
932
+ goto _resume;
933
+ _test_eof: {}
934
+ if ( p == eof )
935
+ {
936
+ if ( _StdTok_eof_trans[cs] > 0 ) {
937
+ _trans = _StdTok_eof_trans[cs] - 1;
938
+ goto _eof_trans;
939
+ }
940
+ }
941
+
942
+ _out: {}
943
+ }
944
+ #line 52 "scanner.rl"
945
+
946
+ if ( cs == StdTok_error )
947
+ fprintf(stderr, "PARSE ERROR\n" );
948
+ else if ( ts ) fprintf(stderr, "STUFF LEFT: '%s'\n", ts);
949
+ return;
950
+
951
+ ret:
952
+ {
953
+ size_t __len = te - ts - skip - trunc;
954
+ if (__len > out_size)
955
+ __len = out_size;
956
+
957
+ *start = ts;
958
+ *end = te;
959
+
960
+ if (strip_char) {
961
+ char *__p = ts + skip;
962
+ char *__o = out;
963
+ for (; __p < (ts + skip + __len); ++__p) {
964
+ if (*__p != strip_char)
965
+ *__o++ = *__p;
966
+ }
967
+ *token_size = __o - out;
968
+ }
969
+ else {
970
+ memcpy(out, ts + skip, __len);
971
+ *token_size = __len;
972
+ }
973
+
974
+ out[*token_size] = 0;
975
+ }
976
+ }
data/ext/scanner.in ADDED
@@ -0,0 +1,56 @@
1
+ #// scanner.in -*-C-*-
2
+
3
+ %%{
4
+ machine StdTok;
5
+ include URL "url.rl";
6
+ include Email "email.rl";
7
+ #// A basic word: one letter followed by any number of letters or digits.
8
+ token = frt_alpha frt_alnum*;
9
+ #// Scanner rules. RET and STRIP are macros from scanner.rl that jump to the ret label, which copies the match to the output buffer.
10
+ frt_tokenizer := |*
11
+ #// Question marks: a run of one or more, e.g. ??
12
+ ('?')+ { RET; };
13
+ #// Exclamation marks: a run of one or more, e.g. !!!
14
+ ('!')+ { RET; };
15
+ #// Twitter user: @ followed by letters or digits, e.g. @feedbackmine
16
+ '@' (alnum)+ { RET; };
17
+ #// Smiley: colon, optional dash, one or more closing parentheses, e.g. :-)
18
+ ':' ('-')? (')')+ { RET; };
19
+ #// Frown: colon, optional dash, one or more opening parentheses, e.g. :-(
20
+ ':' ('-')? ('(')+ { RET; };
21
+ #// Censored word: alphanumerics, one or more asterisks, alphanumerics, e.g. sh*t
22
+ alnum+ ('*')+ alnum+ { RET; };
23
+
24
+ #// Email address, as defined in email.rl
25
+ email { RET; };
26
+
27
+ #// Plain token, or token with a possessive suffix; trunc drops the trailing quote (1) or quote plus s (2) from the output
28
+ token { RET; };
29
+ token [\'] { trunc = 1; RET; };
30
+ token [\'][sS] { trunc = 2; RET; };
31
+
32
+ #// Contractions: letters, a single quote, letters
33
+ frt_alpha+ [\'] frt_alpha+ { RET; };
34
+
35
+ #// Alphanumeric runs joined by hyphens or underscores
36
+ frt_alnum+ ([\-_] frt_alnum+)* { RET; };
37
+
38
+ #// Company name: a token, then & or @, then zero or more tokens
39
+ token [\&\@] token* { RET; };
40
+
41
+ #// URL, as defined in url.rl
42
+ url { RET; };
43
+
44
+ #// Acronym: letter-dot pairs followed by a final letter; STRIP removes the dots from the output
45
+ (frt_alpha '.')+ frt_alpha { STRIP('.'); };
46
+
47
+ #// Integer, or decimal number, each with an optional leading sign
48
+ [\-\+]?frt_digit+ { RET; };
49
+ [\-\+]?frt_digit+ '.' frt_digit+ { RET; };
50
+
51
+ #// A NUL byte ends the scan with a plain return (no token); any other non-alphanumeric character is consumed without producing a token
52
+ 0 { return; };
53
+ (any - frt_alnum) {};
54
+
55
+ *|;
56
+ }%%
data/ext/scanner.rl ADDED
@@ -0,0 +1,83 @@
1
+ /* scanner.rl -*-C-*- */
2
+ #include <ctype.h>
3
+ #include <stdio.h>
4
+ #include <stdlib.h>
5
+ #include <string.h>
6
+ #include <unistd.h>
7
+
8
+ #define RET goto ret; /* emit the current match: jump to the ret label in frt_std_scan (the macro already ends in a semicolon) */
9
+ /* Emit the current match with every occurrence of character c removed from the output. */
10
+ #define STRIP(c) do { \
11
+ strip_char = c; \
12
+ goto ret; \
13
+ } while(0)
14
+
15
+ %%{
16
+ machine StdTok;
17
+ alphtype unsigned char; # input is read as unsigned bytes
18
+ # Character classes referenced by scanner.in, mapped to the Ragel builtins.
19
+ frt_alpha = alpha;
20
+ frt_alnum = alnum;
21
+ frt_digit = digit;
22
+ # Pull in the tokenizer rules (frt_tokenizer) from scanner.in.
23
+ include StdTok "scanner.in";
24
+ # Entry point: on the first character, step back over it (fhold) and call into the frt_tokenizer scanner.
25
+ main := any @{ fhold; fcall frt_tokenizer; };
26
+ }%%
27
+
28
+ %% write data nofinal;
29
+ /* Scans `in` for the next token, copies it NUL-terminated into `out` and reports the raw match through start/end; token_size stays 0 when no token is produced. */
30
+ void frt_std_scan(const char *in, /* text to scan; pe is 0 below, so only the NUL rule in scanner.in or a machine error stops the scan */
31
+ char *out, size_t out_size, /* out must hold out_size + 1 bytes: a NUL is stored after the copied token */
32
+ const char **start, /* receives ts, the first byte of the raw match (0 if none) */
33
+ const char **end, /* receives te, the end of the raw match (0 if none); skip and trunc do not affect it */
34
+ int *token_size) /* receives the number of bytes written to out, excluding the NUL (0 if none) */
35
+ {
36
+ int cs, act, top; /* variables used by the Ragel-generated code */
37
+ int stack[32]; /* call stack for fcall in the main machine */
38
+ char *ts = 0, *te = 0; /* start and end of the current scanner match */
39
+
40
+ %% write init;
41
+
42
+ char *p = (char *)in, *pe = 0, *eof = pe;
43
+ int skip = 0; /* bytes omitted from the front of the output (set by an action in url.rl) */
44
+ int trunc = 0; /* bytes omitted from the end of the output (possessives, trailing URL slash) */
45
+ char strip_char = 0; /* when non-zero, this character is left out of the copied token (see STRIP) */
46
+ /* Default results, kept when no token is produced. */
47
+ *end = 0;
48
+ *start = 0;
49
+ *token_size = 0;
50
+
51
+ %% write exec;
52
+ /* Reached only when the machine stops without an action jumping to ret. */
53
+ if ( cs == StdTok_error )
54
+ fprintf(stderr, "PARSE ERROR\n" );
55
+ else if ( ts ) fprintf(stderr, "STUFF LEFT: '%s'\n", ts);
56
+ return;
57
+ /* Token found: the RET and STRIP macros jump here from scanner actions. */
58
+ ret:
59
+ {
60
+ size_t __len = te - ts - skip - trunc; /* payload length after trimming both ends */
61
+ if (__len > out_size) /* clamp so the copy fits in out */
62
+ __len = out_size;
63
+
64
+ *start = ts;
65
+ *end = te;
66
+
67
+ if (strip_char) {
68
+ char *__p = ts + skip;
69
+ char *__o = out;
70
+ for (; __p < (ts + skip + __len); ++__p) { /* copy the payload, dropping strip_char */
71
+ if (*__p != strip_char)
72
+ *__o++ = *__p;
73
+ }
74
+ *token_size = __o - out; /* count of bytes actually kept */
75
+ }
76
+ else {
77
+ memcpy(out, ts + skip, __len);
78
+ *token_size = __len;
79
+ }
80
+
81
+ out[*token_size] = 0; /* NUL-terminate; this is the extra byte beyond out_size */
82
+ }
83
+ }
data/ext/tokenizer.c ADDED
@@ -0,0 +1,106 @@
1
+ #include <ruby.h>
2
+ #include <stdlib.h>
3
+ #include <string.h>
4
+
5
+ /* Implemented by the Ragel scanner (scanner.rl): copies the next token of
6
+  * `in` into `out` and reports the span of the match through start/end. */
7
+ extern void frt_std_scan(const char *in,
8
+                          char *out, size_t out_size,
9
+                          const char **start,
10
+                          const char **end,
11
+                          int *token_size);
12
+
13
+ /* Native state wrapped by every Ruby Tokenizer instance. */
14
+ typedef struct _Tokenizer {
15
+   char *data;   /* where the next call to Tokenizer#next resumes scanning */
16
+   char *buffer; /* private NUL-terminated copy of the input, or NULL */
17
+ } Tokenizer;
18
+
19
+ /* Allocates a tokenizer with no input attached; returns NULL if malloc fails. */
20
+ Tokenizer *new_tokenizer(void)
21
+ {
22
+   Tokenizer *t = (Tokenizer*)malloc(sizeof(Tokenizer));
23
+   if (t != NULL) {
24
+     t->data = NULL;
25
+     t->buffer = NULL;
26
+   }
27
+   return t;
28
+ }
29
+
30
+ /* Frees a tokenizer together with the input copy it owns; NULL is ignored. */
31
+ void free_tokenizer(Tokenizer *t)
32
+ {
33
+   if (t == NULL)
34
+     return;
35
+   free(t->buffer);
36
+   free(t);
37
+ }
38
+
39
+ /* Allocation function registered for the Tokenizer class. */
40
+ static VALUE tokenizer_alloc(VALUE klass) {
41
+   Tokenizer *tokenizer = new_tokenizer();
42
+   if (tokenizer == NULL)
43
+     rb_memerror();
44
+   /* No mark function: the struct holds no references to Ruby objects. */
45
+   return Data_Wrap_Struct(klass, 0, free_tokenizer, tokenizer);
46
+ }
47
+
48
+ /* Tokenizer#next: returns the next token as a String, or nil once the
49
+  * scanner reports an empty token. Output is capped at sizeof(buffer) - 1. */
50
+ VALUE method_next(VALUE self) {
51
+   const char *start = NULL;
52
+   const char *end = NULL;
53
+   int len = 0;
54
+   char buffer[1024];
55
+   Tokenizer *t;
56
+
57
+   Data_Get_Struct(self, Tokenizer, t);
58
+   /* Allocated but never initialized: there is nothing to scan. */
59
+   if (t->data == NULL)
60
+     return Qnil;
61
+
62
+   /* The scanner stores a NUL after the token, so keep one byte in reserve. */
63
+   frt_std_scan(t->data, buffer, sizeof(buffer) - 1,
64
+                &start, &end, &len);
65
+   if (len == 0)
66
+     return Qnil;
67
+
68
+   /* end points into our own mutable copy, so dropping const is safe. */
69
+   t->data = (char *)end;
70
+   return rb_str_new(buffer, len);
71
+ }
72
+
73
+ /* Tokenizer#initialize(str): attaches the text to tokenize.
74
+  * The bytes are copied because the scanner keeps a raw pointer between calls
75
+  * to #next; pointing into the Ruby String would dangle once that String is
76
+  * garbage-collected or modified. Raises TypeError if str is not String-like. */
77
+ VALUE method_initialize(VALUE self, VALUE str) {
78
+   Tokenizer *t;
79
+   char *copy;
80
+   long size;
81
+
82
+   Data_Get_Struct(self, Tokenizer, t);
83
+
84
+   StringValue(str);
85
+   size = RSTRING_LEN(str);
86
+   copy = (char *)malloc((size_t)size + 1);
87
+   if (copy == NULL)
88
+     rb_memerror();
89
+   memcpy(copy, RSTRING_PTR(str), (size_t)size);
90
+   copy[size] = '\0'; /* the scanner stops at the first NUL byte */
91
+
92
+   free(t->buffer); /* initialize may run more than once on one object */
93
+   t->buffer = copy;
94
+   t->data = copy;
95
+   return self;
96
+ }
97
+
98
+ VALUE cTokenizer = Qnil;
99
+
100
+ /* Extension entry point: defines the Tokenizer class and its methods. */
101
+ void Init_tokenizer(void) {
102
+   cTokenizer = rb_define_class("Tokenizer", rb_cObject);
103
+   rb_define_alloc_func(cTokenizer, tokenizer_alloc);
104
+   rb_define_method(cTokenizer, "initialize", method_initialize, 1);
105
+   rb_define_method(cTokenizer, "next", method_next, 0);
106
+ }
data/ext/url.rl ADDED
@@ -0,0 +1,27 @@
1
+ #// url.rl -*-C-*-
2
+
3
+ %%{
4
+ machine URL;
5
+ #// Character classes used to build host names and paths.
6
+ uword = [_] | alnum; #// underscore or alphanumeric
7
+ dword = '-' | uword; #// uword plus dash
8
+ dalnum = '-' | alnum; #// alphanumeric or dash
9
+ proto = 'http'[s]? | 'ftp' | 'file'; #// recognised schemes: http, https, ftp, file
10
+ urlc = alnum | [.,\/_\-\@\:]; #// characters accepted after a generic scheme prefix
11
+ #// A URL is one of three forms, followed by an optional port and an optional trailing slash.
12
+ url =
13
+ (
14
+ proto [:][/]+ %{ skip = p - ts; } dword+ ([.] uword dword*)+ | #// known scheme plus dotted host; skip records the prefix length so the ret block in scanner.rl omits it from the output
15
+ alnum+ [:][/]+ urlc+ | #// any other scheme followed by urlc characters
16
+ #// Bare host name: dot-terminated labels followed by a listed or two-letter suffix.
17
+ (alnum (dalnum* alnum)? [.])+ #// Subdomains: labels start and end with an alphanumeric, dashes allowed inside
18
+ ('com' |'edu' | 'biz' | 'gov' |
19
+ 'int' | 'info' | 'mil' | 'net' |
20
+ 'org' | alpha{2})
21
+ )
22
+
23
+ #// Optional port: colon followed by digits
24
+ ( [:] digit+ )?
25
+ #// Optional trailing slash; trunc = 1 makes the ret block in scanner.rl drop one byte from the end of the output. NOTE(review): urlc also accepts a slash, so this action may run on a slash in the middle of a path - verify the resulting truncation.
26
+ ([/]? @{ trunc = 1; });
27
+ }%%
data/test/test.rb ADDED
@@ -0,0 +1,7 @@
1
+ require 'ext/tokenizer'
2
+
3
+ t = Tokenizer.new '@feedbackmine sh*t, I LOVE this!!! so funny:-)'
4
+ while (tok = t.next)
5
+ p tok
6
+ end
7
+
metadata ADDED
@@ -0,0 +1,73 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: feedbackmine-tokenizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - FeedbackMine
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-03-23 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.8.3
24
+ version:
25
+ description: tokenizer
26
+ email: feedbackmine@feedbackmine.com
27
+ executables: []
28
+
29
+ extensions:
30
+ - ext/extconf.rb
31
+ extra_rdoc_files:
32
+ - Manifest.txt
33
+ files:
34
+ - ext/tokenizer.c
35
+ - ext/scanner.rl
36
+ - ext/url.rl
37
+ - ext/extconf.rb
38
+ - ext/scanner.c
39
+ - ext/email.rl
40
+ - ext/scanner.in
41
+ - Rakefile
42
+ - Manifest.txt
43
+ - test
44
+ - test/test.rb
45
+ has_rdoc: true
46
+ homepage: http://www.tweetjobsearch.com
47
+ post_install_message:
48
+ rdoc_options:
49
+ - --main
50
+ - README.txt
51
+ require_paths:
52
+ - ext
53
+ required_ruby_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: "0"
58
+ version:
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: "0"
64
+ version:
65
+ requirements: []
66
+
67
+ rubyforge_project: tokenizer
68
+ rubygems_version: 1.2.0
69
+ signing_key:
70
+ specification_version: 2
71
+ summary: tokenizer
72
+ test_files: []
73
+