feedbackmine-ferret_tokenizer 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ ext/ferret_tokenizer.c
2
+ ext/scanner.rl
3
+ ext/url.rl
4
+ ext/extconf.rb
5
+ ext/scanner.c
6
+ ext/email.rl
7
+ ext/scanner.in
8
+ Rakefile
9
+ Manifest.txt
10
+ README.txt
11
+ test
12
+ test/test.rb
13
+
@@ -0,0 +1,20 @@
1
+ == Overview
2
+ A string tokenizer based on Ferret::Analysis::StandardAnalyzer. I made some improvements for better text analysis.
3
+
4
+ == Install
5
+
6
+ sudo gem sources -a http://gems.github.com (you only have to do this once)
7
+ sudo gem install feedbackmine-ferret_tokenizer
8
+
9
+ == Usage
10
+
11
+ require 'ferret_tokenizer'
12
+ t = FerretTokenizer.new '@feedbackmine, I LOVE this!!! :-)'
13
+ while (tok = t.next)
14
+ p tok
15
+ end
16
+
17
+ == Follow us on twitter
18
+
19
+ http://twitter.com/feedbackmine
20
+
@@ -0,0 +1,25 @@
1
+ require 'rubygems'
2
+ require 'hoe'
3
+
4
+ EXT = "ext/ferret_tokenizer.#{Hoe::DLEXT}"
5
+
6
+ Hoe.new('ferret_tokenizer', '0.1.1') do |p|
7
+ p.author = 'FeedbackMine'
8
+ p.email = 'feedbackmine@feedbackmine.com'
9
+ p.url = 'http://www.tweetjobsearch.com'
10
+ p.summary = 'tokenizer'
11
+ p.description = 'tokenizer'
12
+
13
+ p.spec_extras[:extensions] = "ext/extconf.rb"
14
+ p.clean_globs << EXT << "ext/*.o" << "ext/Makefile"
15
+ end
16
+
17
+ task :test => EXT
18
+
19
+ file EXT => ["ext/extconf.rb", "ext/ferret_tokenizer.c"] do
20
+ Dir.chdir "ext" do
21
+ sh "ragel scanner.rl -o scanner.c"
22
+ ruby "extconf.rb"
23
+ sh "make"
24
+ end
25
+ end
@@ -0,0 +1,21 @@
1
+ #// email.rl -*-C-*-
2
+ %%{
3
+ machine Email;
4
+ #// Defines the expression "email" (local_part '@' domain); scanner.in includes this machine and uses it.
5
+ #// RFC 2822 - matching email addresses
6
+ NO_WS_CTL = ( 1..8 | 11 | 12 | 14..31 | 127 );
7
+ ASCII = 1..127;
8
+ atext = [a-zA-Z0-9!#$%&\'*+\-/=?^_`{|}~];
9
+ qtext = ( NO_WS_CTL | 33 | 35..91 | 93..126 );
10
+ dtext = ( NO_WS_CTL | 33..90 | 94..126 );
11
+ dot_atom = atext+ ('.' atext+)*;
12
+ text = ( 1..9 | 11 | 12 | 14..127 );
13
+ quoted_pair = '\\' text;
14
+ quoted_string = '"' ( qtext | quoted_pair )* '"';
15
+ domain_literal = '[' (dtext | quoted_pair)* ']';
16
+ #// Either side may be a dot-atom; the local part may instead be a quoted string, the domain a bracketed literal.
17
+ local_part = dot_atom | quoted_string;
18
+ domain = dot_atom | domain_literal;
19
+ #// Complete address.
20
+ email = local_part '@' domain;
21
+ }%%
@@ -0,0 +1,3 @@
1
+ require 'mkmf'
2
+ $CFLAGS = "#{$CFLAGS} #{ENV['CFLAGS']} -Wall -O3 " # extend the flags mkmf set up instead of replacing them
3
+ create_makefile('ferret_tokenizer') # name must match Init_ferret_tokenizer in ferret_tokenizer.c
@@ -0,0 +1,105 @@
1
+ #include <ruby.h>
2
+
3
+ /* Old 1.8 headers lack RSTRING_PTR; fall back to direct field access there. */
4
+ #ifndef RSTRING_PTR
5
+ #define RSTRING_PTR(s) (RSTRING(s)->ptr)
6
+ #endif
7
+
8
+ /*
9
+  * Ragel-generated scanner (scanner.c). Scans `in` for the next token, copies
10
+  * at most `out_size` bytes of it into `out` followed by a NUL, and stores the
11
+  * position just past the token in `*end`. `*token_size` stays 0 when the end
12
+  * of the input is reached without a token.
13
+  */
14
+ extern void frt_std_scan(const char *in,
15
+                          char *out, size_t out_size,
16
+                          const char **start,
17
+                          const char **end,
18
+                          int *token_size);
19
+
20
+ /* State behind each FerretTokenizer object. */
21
+ typedef struct _Tokenizer {
22
+   VALUE str;         /* frozen copy of the input; marked so the GC keeps its buffer alive */
23
+   const char *data;  /* next unread byte inside str, or NULL before initialize */
24
+ } Tokenizer;
25
+
26
+ /* Allocates an empty tokenizer. Returns NULL when malloc fails. */
27
+ Tokenizer *new_tokenizer(void)
28
+ {
29
+   Tokenizer *t = (Tokenizer*)malloc(sizeof(Tokenizer));
30
+   if (t != NULL) {
31
+     t->str = Qnil;
32
+     t->data = NULL;
33
+   }
34
+   return t;
35
+ }
36
+
37
+ /* Releases a tokenizer created by new_tokenizer(). */
38
+ void free_tokenizer(Tokenizer *t)
39
+ {
40
+   free(t);
41
+ }
42
+
43
+ /* GC mark hook: the wrapped struct is the only reference to the string copy. */
44
+ static void mark_tokenizer(Tokenizer *t)
45
+ {
46
+   rb_gc_mark(t->str);
47
+ }
48
+
49
+ /* Allocator for FerretTokenizer: wraps a fresh Tokenizer in a Ruby object. */
50
+ static VALUE tokenizer_alloc(VALUE klass) {
51
+   Tokenizer *tokenizer = new_tokenizer();
52
+   if (tokenizer == NULL)
53
+     rb_memerror();  /* raises NoMemoryError */
54
+   return Data_Wrap_Struct(klass, mark_tokenizer, free_tokenizer, tokenizer);
55
+ }
56
+
57
+ /*
58
+  * FerretTokenizer#next: returns the next token as a String, or nil once the
59
+  * input is exhausted (or initialize was never run). Tokens longer than the
60
+  * local buffer are truncated to 1023 bytes.
61
+  */
62
+ VALUE method_next(VALUE self) {
63
+   const char *start = NULL;
64
+   const char *end = NULL;
65
+   int len = 0;
66
+   char buffer[1024];
67
+   Tokenizer *t;
68
+
69
+   Data_Get_Struct(self, Tokenizer, t);
70
+   if (t->data == NULL)
71
+     return Qnil;
72
+
73
+   frt_std_scan(t->data, buffer, sizeof(buffer) - 1,
74
+                &start, &end, &len);
75
+   if (len == 0)
76
+     return Qnil;
77
+
78
+   t->data = end;  /* resume after this token on the next call */
79
+   return rb_str_new(buffer, len);
80
+ }
81
+
82
+ /*
83
+  * FerretTokenizer#initialize(str): starts tokenizing str from its first byte.
84
+  * Raises TypeError when str cannot be converted to a String.
85
+  */
86
+ VALUE method_initialize(VALUE self, VALUE str) {
87
+   Tokenizer *t;
88
+
89
+   Data_Get_Struct(self, Tokenizer, t);
90
+   StringValue(str);
91
+   /* Scan a frozen copy so later mutation of the caller's string cannot move the buffer. */
92
+   t->str = rb_str_new4(str);
93
+   t->data = RSTRING_PTR(t->str);
94
+   return self;
95
+ }
96
+
97
+ VALUE cTokenizer = Qnil;
98
+
99
+ /* Extension entry point: defines FerretTokenizer with #initialize and #next. */
100
+ void Init_ferret_tokenizer(void) {
101
+   cTokenizer = rb_define_class("FerretTokenizer", rb_cObject);
102
+   rb_define_alloc_func(cTokenizer, tokenizer_alloc);
103
+   rb_define_method(cTokenizer, "initialize", method_initialize, 1);
104
+   rb_define_method(cTokenizer, "next", method_next, 0);
105
+ }
@@ -0,0 +1,976 @@
1
+ #line 1 "scanner.rl"
2
+ /* scanner.rl -*-C-*- */
3
+ #include <ctype.h>
4
+ #include <stdio.h>
5
+ #include <stdlib.h>
6
+ #include <string.h>
7
+ #include <unistd.h>
8
+
9
+ #define RET goto ret;
10
+
11
+ #define STRIP(c) do { \
12
+ strip_char = c; \
13
+ goto ret; \
14
+ } while(0)
15
+
16
+ #line 26 "scanner.rl"
17
+
18
+
19
+
20
+ #line 21 "scanner.c"
21
+ static const char _StdTok_actions[] = {
22
+ 0, 1, 0, 1, 1, 1, 2, 1,
23
+ 3, 1, 4, 1, 5, 1, 21, 1,
24
+ 23, 1, 24, 1, 25, 1, 26, 1,
25
+ 27, 1, 28, 1, 29, 1, 30, 1,
26
+ 31, 1, 32, 1, 33, 1, 34, 1,
27
+ 35, 1, 36, 1, 37, 1, 38, 1,
28
+ 39, 1, 40, 1, 41, 1, 42, 1,
29
+ 43, 1, 44, 1, 45, 1, 46, 2,
30
+ 1, 22, 2, 5, 6, 2, 5, 7,
31
+ 2, 5, 8, 2, 5, 9, 2, 5,
32
+ 10, 2, 5, 11, 2, 5, 12, 2,
33
+ 5, 13, 2, 5, 14, 2, 5, 15,
34
+ 2, 5, 16, 2, 5, 17, 2, 5,
35
+ 18, 2, 5, 19, 2, 5, 20, 3,
36
+ 5, 1, 16
37
+ };
38
+
39
+ static const short _StdTok_key_offsets[] = {
40
+ 0, 0, 0, 14, 28, 43, 57, 69,
41
+ 75, 85, 86, 92, 107, 128, 149, 170,
42
+ 191, 218, 239, 241, 263, 285, 307, 329,
43
+ 351, 373, 395, 417, 439, 460, 487, 488,
44
+ 495, 497, 524, 525, 538, 538, 563, 577,
45
+ 591, 601, 615, 631, 647, 663, 686, 707,
46
+ 729, 732, 755, 778, 801, 824, 848, 871,
47
+ 894, 917, 939, 961, 982, 990, 1013, 1016,
48
+ 1017, 1018, 1033, 1039, 1064, 1083, 1104, 1125,
49
+ 1144, 1165, 1187, 1209, 1231, 1253, 1275, 1297,
50
+ 1319, 1341, 1366, 1382, 1401, 1421, 1446, 1473,
51
+ 1499, 1525, 1550, 1563, 1589, 1615, 1641, 1667
52
+ };
53
+
54
+ static const unsigned char _StdTok_trans_keys[] = {
55
+ 33u, 46u, 61u, 64u, 35u, 39u, 42u, 43u,
56
+ 45u, 57u, 63u, 90u, 94u, 126u, 33u, 45u,
57
+ 61u, 63u, 35u, 39u, 42u, 43u, 47u, 57u,
58
+ 65u, 90u, 94u, 126u, 33u, 45u, 61u, 63u,
59
+ 91u, 35u, 39u, 42u, 43u, 47u, 57u, 65u,
60
+ 90u, 94u, 126u, 33u, 45u, 61u, 63u, 35u,
61
+ 39u, 42u, 43u, 47u, 57u, 65u, 90u, 94u,
62
+ 126u, 92u, 93u, 1u, 8u, 11u, 12u, 14u,
63
+ 31u, 33u, 90u, 94u, 127u, 1u, 9u, 11u,
64
+ 12u, 14u, 127u, 34u, 92u, 1u, 8u, 11u,
65
+ 12u, 14u, 31u, 33u, 127u, 64u, 1u, 9u,
66
+ 11u, 12u, 14u, 127u, 33u, 45u, 47u, 61u,
67
+ 63u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
68
+ 90u, 94u, 126u, 33u, 42u, 43u, 46u, 61u,
69
+ 63u, 64u, 35u, 39u, 45u, 47u, 48u, 57u,
70
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
71
+ 33u, 45u, 46u, 47u, 61u, 63u, 64u, 35u,
72
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
73
+ 96u, 97u, 122u, 123u, 126u, 33u, 45u, 46u,
74
+ 47u, 61u, 63u, 64u, 35u, 39u, 42u, 43u,
75
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
76
+ 123u, 126u, 33u, 45u, 46u, 47u, 61u, 63u,
77
+ 64u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
78
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
79
+ 45u, 47u, 61u, 63u, 98u, 99u, 101u, 103u,
80
+ 105u, 109u, 110u, 111u, 35u, 39u, 42u, 43u,
81
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
82
+ 123u, 126u, 33u, 45u, 46u, 47u, 61u, 63u,
83
+ 64u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
84
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 48u,
85
+ 57u, 33u, 45u, 46u, 47u, 61u, 63u, 64u,
86
+ 105u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
87
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
88
+ 45u, 46u, 47u, 61u, 63u, 64u, 111u, 35u,
89
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
90
+ 96u, 97u, 122u, 123u, 126u, 33u, 45u, 46u,
91
+ 47u, 61u, 63u, 64u, 100u, 35u, 39u, 42u,
92
+ 43u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
93
+ 122u, 123u, 126u, 33u, 45u, 46u, 47u, 61u,
94
+ 63u, 64u, 111u, 35u, 39u, 42u, 43u, 48u,
95
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
96
+ 126u, 33u, 45u, 46u, 47u, 61u, 63u, 64u,
97
+ 110u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
98
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
99
+ 45u, 46u, 47u, 61u, 63u, 64u, 111u, 35u,
100
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
101
+ 96u, 97u, 122u, 123u, 126u, 33u, 45u, 46u,
102
+ 47u, 61u, 63u, 64u, 105u, 35u, 39u, 42u,
103
+ 43u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
104
+ 122u, 123u, 126u, 33u, 45u, 46u, 47u, 61u,
105
+ 63u, 64u, 101u, 35u, 39u, 42u, 43u, 48u,
106
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
107
+ 126u, 33u, 45u, 46u, 47u, 61u, 63u, 64u,
108
+ 114u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
109
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
110
+ 46u, 61u, 63u, 64u, 35u, 39u, 42u, 43u,
111
+ 45u, 47u, 48u, 57u, 65u, 90u, 94u, 96u,
112
+ 97u, 122u, 123u, 126u, 33u, 45u, 47u, 61u,
113
+ 63u, 98u, 99u, 101u, 103u, 105u, 109u, 110u,
114
+ 111u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
115
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 47u,
116
+ 95u, 44u, 58u, 64u, 90u, 97u, 122u, 40u,
117
+ 41u, 33u, 45u, 47u, 61u, 63u, 98u, 99u,
118
+ 101u, 103u, 105u, 109u, 110u, 111u, 35u, 39u,
119
+ 42u, 43u, 48u, 57u, 65u, 90u, 94u, 96u,
120
+ 97u, 122u, 123u, 126u, 47u, 45u, 47u, 58u,
121
+ 64u, 95u, 44u, 46u, 48u, 57u, 65u, 90u,
122
+ 97u, 122u, 0u, 33u, 34u, 42u, 43u, 45u,
123
+ 47u, 58u, 61u, 63u, 64u, 102u, 104u, 35u,
124
+ 39u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
125
+ 122u, 123u, 126u, 33u, 46u, 61u, 64u, 35u,
126
+ 39u, 42u, 43u, 45u, 57u, 63u, 90u, 94u,
127
+ 126u, 33u, 46u, 61u, 63u, 35u, 39u, 42u,
128
+ 43u, 45u, 57u, 65u, 90u, 94u, 126u, 34u,
129
+ 92u, 1u, 8u, 11u, 12u, 14u, 31u, 33u,
130
+ 127u, 33u, 46u, 61u, 64u, 35u, 39u, 42u,
131
+ 43u, 45u, 57u, 63u, 90u, 94u, 126u, 33u,
132
+ 46u, 61u, 64u, 35u, 39u, 42u, 43u, 45u,
133
+ 47u, 48u, 57u, 63u, 90u, 94u, 126u, 33u,
134
+ 46u, 61u, 64u, 35u, 39u, 42u, 43u, 45u,
135
+ 47u, 48u, 57u, 63u, 90u, 94u, 126u, 33u,
136
+ 46u, 61u, 64u, 35u, 39u, 42u, 43u, 45u,
137
+ 47u, 48u, 57u, 63u, 90u, 94u, 126u, 33u,
138
+ 42u, 43u, 45u, 46u, 47u, 58u, 61u, 63u,
139
+ 64u, 95u, 35u, 39u, 48u, 57u, 65u, 90u,
140
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 46u,
141
+ 61u, 63u, 64u, 35u, 39u, 42u, 43u, 45u,
142
+ 47u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
143
+ 122u, 123u, 126u, 33u, 45u, 46u, 47u, 58u,
144
+ 61u, 63u, 64u, 35u, 39u, 42u, 43u, 48u,
145
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
146
+ 126u, 47u, 48u, 57u, 33u, 45u, 46u, 47u,
147
+ 58u, 61u, 63u, 64u, 122u, 35u, 39u, 42u,
148
+ 43u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
149
+ 121u, 123u, 126u, 33u, 45u, 46u, 47u, 58u,
150
+ 61u, 63u, 64u, 109u, 35u, 39u, 42u, 43u,
151
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
152
+ 123u, 126u, 33u, 45u, 46u, 47u, 58u, 61u,
153
+ 63u, 64u, 117u, 35u, 39u, 42u, 43u, 48u,
154
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
155
+ 126u, 33u, 45u, 46u, 47u, 58u, 61u, 63u,
156
+ 64u, 118u, 35u, 39u, 42u, 43u, 48u, 57u,
157
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
158
+ 33u, 45u, 46u, 47u, 58u, 61u, 63u, 64u,
159
+ 102u, 116u, 35u, 39u, 42u, 43u, 48u, 57u,
160
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
161
+ 33u, 45u, 46u, 47u, 58u, 61u, 63u, 64u,
162
+ 108u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
163
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
164
+ 45u, 46u, 47u, 58u, 61u, 63u, 64u, 116u,
165
+ 35u, 39u, 42u, 43u, 48u, 57u, 65u, 90u,
166
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 45u,
167
+ 46u, 47u, 58u, 61u, 63u, 64u, 103u, 35u,
168
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
169
+ 96u, 97u, 122u, 123u, 126u, 33u, 45u, 46u,
170
+ 47u, 61u, 63u, 64u, 95u, 35u, 39u, 42u,
171
+ 43u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
172
+ 122u, 123u, 126u, 33u, 45u, 46u, 47u, 61u,
173
+ 63u, 64u, 95u, 35u, 39u, 42u, 43u, 48u,
174
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
175
+ 126u, 33u, 45u, 46u, 47u, 61u, 63u, 64u,
176
+ 35u, 39u, 42u, 43u, 48u, 57u, 65u, 90u,
177
+ 94u, 96u, 97u, 122u, 123u, 126u, 47u, 95u,
178
+ 44u, 58u, 64u, 90u, 97u, 122u, 33u, 42u,
179
+ 43u, 45u, 46u, 47u, 58u, 61u, 63u, 64u,
180
+ 95u, 35u, 39u, 48u, 57u, 65u, 90u, 94u,
181
+ 96u, 97u, 122u, 123u, 126u, 40u, 41u, 45u,
182
+ 40u, 41u, 33u, 46u, 61u, 63u, 64u, 35u,
183
+ 39u, 42u, 43u, 45u, 57u, 65u, 90u, 94u,
184
+ 126u, 48u, 57u, 65u, 90u, 97u, 122u, 33u,
185
+ 38u, 39u, 42u, 43u, 45u, 46u, 47u, 58u,
186
+ 61u, 63u, 64u, 95u, 35u, 37u, 48u, 57u,
187
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
188
+ 33u, 46u, 61u, 63u, 64u, 35u, 39u, 42u,
189
+ 43u, 45u, 57u, 65u, 90u, 94u, 96u, 97u,
190
+ 122u, 123u, 126u, 33u, 46u, 61u, 63u, 64u,
191
+ 35u, 39u, 42u, 43u, 45u, 47u, 48u, 57u,
192
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
193
+ 33u, 46u, 61u, 63u, 64u, 83u, 115u, 35u,
194
+ 39u, 42u, 43u, 45u, 57u, 65u, 90u, 94u,
195
+ 96u, 97u, 122u, 123u, 126u, 33u, 46u, 61u,
196
+ 63u, 64u, 35u, 39u, 42u, 43u, 45u, 57u,
197
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
198
+ 33u, 45u, 46u, 47u, 61u, 63u, 64u, 35u,
199
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
200
+ 96u, 97u, 122u, 123u, 126u, 33u, 45u, 46u,
201
+ 47u, 61u, 63u, 64u, 105u, 35u, 39u, 42u,
202
+ 43u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
203
+ 122u, 123u, 126u, 33u, 45u, 46u, 47u, 61u,
204
+ 63u, 64u, 111u, 35u, 39u, 42u, 43u, 48u,
205
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
206
+ 126u, 33u, 45u, 46u, 47u, 61u, 63u, 64u,
207
+ 100u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
208
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
209
+ 45u, 46u, 47u, 61u, 63u, 64u, 111u, 35u,
210
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
211
+ 96u, 97u, 122u, 123u, 126u, 33u, 45u, 46u,
212
+ 47u, 61u, 63u, 64u, 110u, 35u, 39u, 42u,
213
+ 43u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
214
+ 122u, 123u, 126u, 33u, 45u, 46u, 47u, 61u,
215
+ 63u, 64u, 105u, 35u, 39u, 42u, 43u, 48u,
216
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
217
+ 126u, 33u, 45u, 46u, 47u, 61u, 63u, 64u,
218
+ 101u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
219
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
220
+ 45u, 46u, 47u, 61u, 63u, 64u, 114u, 35u,
221
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
222
+ 96u, 97u, 122u, 123u, 126u, 33u, 38u, 39u,
223
+ 42u, 43u, 45u, 46u, 47u, 58u, 61u, 63u,
224
+ 64u, 95u, 35u, 37u, 48u, 57u, 65u, 90u,
225
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 46u,
226
+ 61u, 64u, 83u, 115u, 35u, 39u, 42u, 43u,
227
+ 45u, 57u, 63u, 90u, 94u, 126u, 33u, 45u,
228
+ 61u, 63u, 91u, 35u, 39u, 42u, 43u, 47u,
229
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
230
+ 126u, 33u, 46u, 61u, 63u, 35u, 39u, 42u,
231
+ 43u, 45u, 47u, 48u, 57u, 65u, 90u, 94u,
232
+ 96u, 97u, 122u, 123u, 126u, 33u, 38u, 39u,
233
+ 42u, 43u, 45u, 46u, 47u, 58u, 61u, 63u,
234
+ 64u, 95u, 35u, 37u, 48u, 57u, 65u, 90u,
235
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 38u,
236
+ 39u, 42u, 43u, 45u, 46u, 47u, 58u, 61u,
237
+ 63u, 64u, 95u, 105u, 116u, 35u, 37u, 48u,
238
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
239
+ 126u, 33u, 38u, 39u, 42u, 43u, 45u, 46u,
240
+ 47u, 58u, 61u, 63u, 64u, 95u, 108u, 35u,
241
+ 37u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
242
+ 122u, 123u, 126u, 33u, 38u, 39u, 42u, 43u,
243
+ 45u, 46u, 47u, 58u, 61u, 63u, 64u, 95u,
244
+ 101u, 35u, 37u, 48u, 57u, 65u, 90u, 94u,
245
+ 96u, 97u, 122u, 123u, 126u, 33u, 38u, 39u,
246
+ 42u, 43u, 45u, 46u, 47u, 58u, 61u, 63u,
247
+ 64u, 95u, 35u, 37u, 48u, 57u, 65u, 90u,
248
+ 94u, 96u, 97u, 122u, 123u, 126u, 45u, 47u,
249
+ 58u, 64u, 95u, 44u, 46u, 48u, 57u, 65u,
250
+ 90u, 97u, 122u, 33u, 38u, 39u, 42u, 43u,
251
+ 45u, 46u, 47u, 58u, 61u, 63u, 64u, 95u,
252
+ 112u, 35u, 37u, 48u, 57u, 65u, 90u, 94u,
253
+ 96u, 97u, 122u, 123u, 126u, 33u, 38u, 39u,
254
+ 42u, 43u, 45u, 46u, 47u, 58u, 61u, 63u,
255
+ 64u, 95u, 116u, 35u, 37u, 48u, 57u, 65u,
256
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
257
+ 38u, 39u, 42u, 43u, 45u, 46u, 47u, 58u,
258
+ 61u, 63u, 64u, 95u, 116u, 35u, 37u, 48u,
259
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
260
+ 126u, 33u, 38u, 39u, 42u, 43u, 45u, 46u,
261
+ 47u, 58u, 61u, 63u, 64u, 95u, 112u, 35u,
262
+ 37u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
263
+ 122u, 123u, 126u, 33u, 38u, 39u, 42u, 43u,
264
+ 45u, 46u, 47u, 58u, 61u, 63u, 64u, 95u,
265
+ 115u, 35u, 37u, 48u, 57u, 65u, 90u, 94u,
266
+ 96u, 97u, 122u, 123u, 126u, 0
267
+ };
268
+
269
+ static const char _StdTok_single_lengths[] = {
270
+ 0, 0, 4, 4, 5, 4, 2, 0,
271
+ 2, 1, 0, 5, 7, 7, 7, 7,
272
+ 13, 7, 0, 8, 8, 8, 8, 8,
273
+ 8, 8, 8, 8, 5, 13, 1, 1,
274
+ 2, 13, 1, 5, 0, 13, 4, 4,
275
+ 2, 4, 4, 4, 4, 11, 5, 8,
276
+ 1, 9, 9, 9, 9, 10, 9, 9,
277
+ 9, 8, 8, 7, 2, 11, 3, 1,
278
+ 1, 5, 0, 13, 5, 5, 7, 5,
279
+ 7, 8, 8, 8, 8, 8, 8, 8,
280
+ 8, 13, 6, 5, 4, 13, 15, 14,
281
+ 14, 13, 5, 14, 14, 14, 14, 14
282
+ };
283
+
284
+ static const char _StdTok_range_lengths[] = {
285
+ 0, 0, 5, 5, 5, 5, 5, 3,
286
+ 4, 0, 3, 5, 7, 7, 7, 7,
287
+ 7, 7, 1, 7, 7, 7, 7, 7,
288
+ 7, 7, 7, 7, 8, 7, 0, 3,
289
+ 0, 7, 0, 4, 0, 6, 5, 5,
290
+ 4, 5, 6, 6, 6, 6, 8, 7,
291
+ 1, 7, 7, 7, 7, 7, 7, 7,
292
+ 7, 7, 7, 7, 3, 6, 0, 0,
293
+ 0, 5, 3, 6, 7, 8, 7, 7,
294
+ 7, 7, 7, 7, 7, 7, 7, 7,
295
+ 7, 6, 5, 7, 8, 6, 6, 6,
296
+ 6, 6, 4, 6, 6, 6, 6, 6
297
+ };
298
+
299
+ static const short _StdTok_index_offsets[] = {
300
+ 0, 0, 1, 11, 21, 32, 42, 50,
301
+ 54, 61, 63, 67, 78, 93, 108, 123,
302
+ 138, 159, 174, 176, 192, 208, 224, 240,
303
+ 256, 272, 288, 304, 320, 334, 355, 357,
304
+ 362, 365, 386, 388, 398, 399, 419, 429,
305
+ 439, 446, 456, 467, 478, 489, 507, 521,
306
+ 537, 540, 557, 574, 591, 608, 626, 643,
307
+ 660, 677, 693, 709, 724, 730, 748, 752,
308
+ 754, 756, 767, 771, 791, 804, 818, 833,
309
+ 846, 861, 877, 893, 909, 925, 941, 957,
310
+ 973, 989, 1009, 1021, 1034, 1047, 1067, 1089,
311
+ 1110, 1131, 1151, 1161, 1182, 1203, 1224, 1245
312
+ };
313
+
314
+ static const char _StdTok_indicies[] = {
315
+ 0, 2, 3, 2, 4, 2, 2, 2,
316
+ 2, 2, 1, 2, 2, 2, 2, 2,
317
+ 2, 2, 2, 2, 1, 5, 5, 5,
318
+ 5, 6, 5, 5, 5, 5, 5, 1,
319
+ 5, 5, 5, 5, 5, 5, 5, 5,
320
+ 5, 7, 8, 9, 6, 6, 6, 6,
321
+ 6, 1, 6, 6, 6, 1, 12, 13,
322
+ 11, 11, 11, 11, 10, 4, 10, 11,
323
+ 11, 11, 10, 2, 2, 2, 2, 2,
324
+ 2, 2, 15, 2, 2, 14, 2, 16,
325
+ 2, 3, 2, 2, 4, 2, 2, 17,
326
+ 17, 2, 17, 2, 1, 2, 18, 3,
327
+ 2, 2, 2, 4, 2, 2, 19, 19,
328
+ 2, 19, 2, 1, 2, 18, 3, 2,
329
+ 2, 2, 4, 2, 2, 20, 20, 2,
330
+ 20, 2, 1, 2, 18, 21, 2, 2,
331
+ 2, 4, 2, 2, 20, 20, 2, 20,
332
+ 2, 1, 2, 2, 2, 2, 2, 23,
333
+ 24, 25, 26, 27, 28, 29, 30, 2,
334
+ 2, 20, 22, 2, 22, 2, 1, 2,
335
+ 18, 21, 2, 2, 2, 4, 2, 2,
336
+ 20, 31, 2, 31, 2, 1, 33, 32,
337
+ 2, 18, 21, 2, 2, 2, 4, 34,
338
+ 2, 2, 20, 31, 2, 31, 2, 1,
339
+ 2, 18, 21, 2, 2, 2, 4, 35,
340
+ 2, 2, 20, 31, 2, 31, 2, 1,
341
+ 2, 18, 21, 2, 2, 2, 4, 36,
342
+ 2, 2, 20, 31, 2, 31, 2, 1,
343
+ 2, 18, 21, 2, 2, 2, 4, 37,
344
+ 2, 2, 20, 31, 2, 31, 2, 1,
345
+ 2, 18, 21, 2, 2, 2, 4, 38,
346
+ 2, 2, 20, 31, 2, 31, 2, 1,
347
+ 2, 18, 21, 2, 2, 2, 4, 31,
348
+ 2, 2, 20, 20, 2, 20, 2, 32,
349
+ 2, 18, 21, 2, 2, 2, 4, 39,
350
+ 2, 2, 20, 31, 2, 31, 2, 1,
351
+ 2, 18, 21, 2, 2, 2, 4, 40,
352
+ 2, 2, 20, 31, 2, 31, 2, 1,
353
+ 2, 18, 21, 2, 2, 2, 4, 41,
354
+ 2, 2, 20, 31, 2, 31, 2, 1,
355
+ 2, 3, 2, 2, 4, 2, 2, 2,
356
+ 42, 42, 2, 42, 2, 1, 2, 2,
357
+ 2, 2, 2, 23, 24, 25, 26, 27,
358
+ 28, 29, 30, 2, 2, 44, 22, 2,
359
+ 22, 2, 43, 45, 1, 46, 46, 46,
360
+ 46, 1, 47, 48, 10, 2, 2, 2,
361
+ 2, 2, 50, 51, 52, 53, 54, 55,
362
+ 56, 57, 2, 2, 20, 49, 2, 49,
363
+ 2, 1, 59, 58, 60, 61, 46, 46,
364
+ 60, 46, 60, 60, 60, 58, 62, 63,
365
+ 65, 66, 67, 68, 68, 67, 70, 67,
366
+ 71, 72, 74, 75, 67, 69, 73, 67,
367
+ 73, 67, 64, 65, 3, 2, 4, 2,
368
+ 2, 2, 2, 2, 76, 5, 78, 5,
369
+ 5, 5, 5, 5, 5, 5, 77, 12,
370
+ 13, 11, 11, 11, 11, 79, 2, 3,
371
+ 2, 4, 2, 2, 2, 2, 2, 1,
372
+ 2, 3, 2, 4, 2, 2, 2, 80,
373
+ 2, 2, 79, 2, 82, 2, 4, 2,
374
+ 2, 2, 80, 2, 2, 81, 2, 3,
375
+ 2, 4, 2, 2, 2, 15, 2, 2,
376
+ 83, 2, 16, 2, 85, 86, 2, 87,
377
+ 2, 2, 4, 89, 2, 69, 88, 2,
378
+ 88, 2, 84, 2, 3, 2, 2, 4,
379
+ 2, 2, 2, 17, 17, 2, 17, 2,
380
+ 90, 2, 18, 21, 92, 93, 2, 2,
381
+ 4, 2, 2, 20, 20, 2, 20, 2,
382
+ 91, 94, 33, 91, 2, 18, 21, 92,
383
+ 93, 2, 2, 4, 31, 2, 2, 20,
384
+ 20, 2, 20, 2, 91, 2, 18, 21,
385
+ 92, 93, 2, 2, 4, 31, 2, 2,
386
+ 20, 20, 2, 20, 2, 91, 2, 18,
387
+ 21, 92, 93, 2, 2, 4, 31, 2,
388
+ 2, 20, 20, 2, 20, 2, 91, 2,
389
+ 18, 21, 92, 93, 2, 2, 4, 31,
390
+ 2, 2, 20, 20, 2, 20, 2, 91,
391
+ 2, 18, 21, 92, 93, 2, 2, 4,
392
+ 95, 31, 2, 2, 20, 20, 2, 20,
393
+ 2, 91, 2, 18, 21, 92, 93, 2,
394
+ 2, 4, 31, 2, 2, 20, 20, 2,
395
+ 20, 2, 91, 2, 18, 21, 92, 93,
396
+ 2, 2, 4, 31, 2, 2, 20, 20,
397
+ 2, 20, 2, 91, 2, 18, 21, 92,
398
+ 93, 2, 2, 4, 31, 2, 2, 20,
399
+ 20, 2, 20, 2, 91, 2, 85, 21,
400
+ 2, 2, 2, 4, 89, 2, 2, 19,
401
+ 19, 2, 19, 2, 84, 2, 89, 3,
402
+ 2, 2, 2, 4, 89, 2, 2, 42,
403
+ 42, 2, 42, 2, 84, 2, 18, 21,
404
+ 2, 2, 2, 4, 2, 2, 44, 20,
405
+ 2, 20, 2, 83, 96, 46, 46, 46,
406
+ 46, 91, 2, 16, 2, 85, 21, 2,
407
+ 87, 2, 2, 4, 89, 2, 88, 88,
408
+ 2, 88, 2, 84, 47, 48, 97, 79,
409
+ 47, 98, 48, 99, 2, 3, 2, 71,
410
+ 4, 2, 2, 2, 2, 2, 100, 101,
411
+ 101, 101, 1, 2, 103, 104, 16, 2,
412
+ 85, 105, 2, 87, 2, 2, 107, 89,
413
+ 2, 106, 108, 2, 108, 2, 102, 2,
414
+ 3, 2, 2, 4, 2, 2, 2, 110,
415
+ 2, 110, 2, 109, 2, 3, 2, 2,
416
+ 4, 2, 2, 2, 110, 110, 2, 110,
417
+ 2, 109, 2, 3, 2, 2, 4, 113,
418
+ 113, 2, 2, 2, 112, 2, 112, 2,
419
+ 111, 2, 3, 2, 2, 4, 2, 2,
420
+ 2, 112, 2, 112, 2, 1, 2, 18,
421
+ 105, 2, 2, 2, 4, 2, 2, 20,
422
+ 31, 2, 31, 2, 114, 2, 18, 105,
423
+ 2, 2, 2, 4, 34, 2, 2, 20,
424
+ 31, 2, 31, 2, 114, 2, 18, 105,
425
+ 2, 2, 2, 4, 35, 2, 2, 20,
426
+ 31, 2, 31, 2, 114, 2, 18, 105,
427
+ 2, 2, 2, 4, 36, 2, 2, 20,
428
+ 31, 2, 31, 2, 114, 2, 18, 105,
429
+ 2, 2, 2, 4, 37, 2, 2, 20,
430
+ 31, 2, 31, 2, 114, 2, 18, 105,
431
+ 2, 2, 2, 4, 38, 2, 2, 20,
432
+ 31, 2, 31, 2, 114, 2, 18, 105,
433
+ 2, 2, 2, 4, 39, 2, 2, 20,
434
+ 31, 2, 31, 2, 114, 2, 18, 105,
435
+ 2, 2, 2, 4, 40, 2, 2, 20,
436
+ 31, 2, 31, 2, 114, 2, 18, 105,
437
+ 2, 2, 2, 4, 41, 2, 2, 20,
438
+ 31, 2, 31, 2, 114, 2, 103, 115,
439
+ 16, 2, 85, 21, 2, 87, 2, 2,
440
+ 107, 89, 2, 106, 106, 2, 106, 2,
441
+ 102, 2, 3, 2, 4, 116, 116, 2,
442
+ 2, 2, 2, 2, 111, 5, 5, 5,
443
+ 5, 6, 5, 5, 5, 117, 5, 117,
444
+ 5, 109, 5, 78, 5, 5, 5, 5,
445
+ 5, 117, 117, 5, 117, 5, 77, 2,
446
+ 103, 104, 16, 2, 85, 21, 2, 87,
447
+ 2, 2, 107, 89, 2, 106, 108, 2,
448
+ 108, 2, 102, 2, 103, 104, 16, 2,
449
+ 85, 105, 2, 87, 2, 2, 107, 89,
450
+ 118, 119, 2, 106, 108, 2, 108, 2,
451
+ 102, 2, 103, 104, 16, 2, 85, 21,
452
+ 2, 87, 2, 2, 107, 89, 120, 2,
453
+ 106, 108, 2, 108, 2, 102, 2, 103,
454
+ 104, 16, 2, 85, 21, 2, 87, 2,
455
+ 2, 107, 89, 121, 2, 106, 108, 2,
456
+ 108, 2, 102, 2, 103, 104, 16, 2,
457
+ 85, 21, 2, 122, 2, 2, 107, 89,
458
+ 2, 106, 108, 2, 108, 2, 102, 60,
459
+ 123, 46, 46, 60, 46, 60, 60, 60,
460
+ 91, 2, 103, 104, 16, 2, 85, 21,
461
+ 2, 87, 2, 2, 107, 89, 121, 2,
462
+ 106, 108, 2, 108, 2, 102, 2, 103,
463
+ 104, 16, 2, 85, 105, 2, 87, 2,
464
+ 2, 107, 89, 124, 2, 106, 108, 2,
465
+ 108, 2, 102, 2, 103, 104, 16, 2,
466
+ 85, 21, 2, 87, 2, 2, 107, 89,
467
+ 125, 2, 106, 108, 2, 108, 2, 102,
468
+ 2, 103, 104, 16, 2, 85, 21, 2,
469
+ 87, 2, 2, 107, 89, 126, 2, 106,
470
+ 108, 2, 108, 2, 102, 2, 103, 104,
471
+ 16, 2, 85, 21, 2, 122, 2, 2,
472
+ 107, 89, 121, 2, 106, 108, 2, 108,
473
+ 2, 102, 0
474
+ };
475
+
476
+ static const char _StdTok_trans_targs[] = {
477
+ 36, 37, 2, 3, 4, 39, 6, 37,
478
+ 7, 37, 37, 8, 9, 10, 37, 44,
479
+ 12, 46, 14, 57, 15, 16, 17, 19,
480
+ 20, 21, 22, 23, 25, 26, 27, 47,
481
+ 37, 48, 49, 50, 51, 52, 53, 54,
482
+ 55, 56, 58, 37, 59, 31, 60, 63,
483
+ 64, 72, 73, 74, 75, 76, 77, 78,
484
+ 79, 80, 37, 35, 60, 90, 0, 37,
485
+ 37, 38, 40, 41, 42, 45, 62, 65,
486
+ 66, 67, 86, 92, 37, 37, 5, 37,
487
+ 43, 37, 11, 37, 37, 13, 29, 30,
488
+ 61, 28, 37, 37, 41, 18, 37, 24,
489
+ 60, 32, 37, 37, 37, 66, 37, 68,
490
+ 70, 33, 81, 83, 85, 37, 69, 37,
491
+ 71, 71, 37, 82, 41, 84, 87, 91,
492
+ 88, 89, 34, 90, 93, 94, 95
493
+ };
494
+
495
+ static const char _StdTok_trans_actions[] = {
496
+ 5, 61, 0, 0, 0, 11, 0, 49,
497
+ 0, 13, 59, 0, 0, 0, 57, 105,
498
+ 0, 75, 0, 90, 0, 0, 0, 0,
499
+ 0, 0, 0, 0, 0, 0, 0, 96,
500
+ 55, 0, 96, 96, 96, 96, 96, 96,
501
+ 96, 96, 90, 53, 105, 0, 0, 0,
502
+ 0, 99, 99, 99, 99, 99, 99, 99,
503
+ 99, 99, 51, 0, 1, 0, 0, 15,
504
+ 17, 69, 108, 108, 108, 90, 11, 66,
505
+ 108, 78, 78, 78, 21, 29, 0, 47,
506
+ 102, 43, 0, 45, 35, 0, 0, 0,
507
+ 90, 0, 27, 39, 111, 0, 63, 0,
508
+ 3, 0, 25, 23, 19, 72, 31, 93,
509
+ 81, 0, 78, 93, 78, 37, 93, 33,
510
+ 87, 84, 41, 81, 84, 11, 78, 78,
511
+ 78, 78, 0, 3, 78, 78, 78
512
+ };
513
+
514
+ static const char _StdTok_to_state_actions[] = {
515
+ 0, 7, 0, 0, 0, 0, 0, 0,
516
+ 0, 0, 0, 0, 0, 0, 0, 0,
517
+ 0, 0, 0, 0, 0, 0, 0, 0,
518
+ 0, 0, 0, 0, 0, 0, 0, 0,
519
+ 0, 0, 0, 0, 7, 7, 0, 0,
520
+ 0, 0, 0, 0, 0, 0, 0, 0,
521
+ 0, 0, 0, 0, 0, 0, 0, 0,
522
+ 0, 0, 0, 0, 0, 0, 0, 0,
523
+ 0, 0, 0, 0, 0, 0, 0, 0,
524
+ 0, 0, 0, 0, 0, 0, 0, 0,
525
+ 0, 0, 0, 0, 0, 0, 0, 0,
526
+ 0, 0, 0, 0, 0, 0, 0, 0
527
+ };
528
+
529
+ static const char _StdTok_from_state_actions[] = {
530
+ 0, 0, 0, 0, 0, 0, 0, 0,
531
+ 0, 0, 0, 0, 0, 0, 0, 0,
532
+ 0, 0, 0, 0, 0, 0, 0, 0,
533
+ 0, 0, 0, 0, 0, 0, 0, 0,
534
+ 0, 0, 0, 0, 0, 9, 0, 0,
535
+ 0, 0, 0, 0, 0, 0, 0, 0,
536
+ 0, 0, 0, 0, 0, 0, 0, 0,
537
+ 0, 0, 0, 0, 0, 0, 0, 0,
538
+ 0, 0, 0, 0, 0, 0, 0, 0,
539
+ 0, 0, 0, 0, 0, 0, 0, 0,
540
+ 0, 0, 0, 0, 0, 0, 0, 0,
541
+ 0, 0, 0, 0, 0, 0, 0, 0
542
+ };
543
+
544
+ static const short _StdTok_eof_trans[] = {
545
+ 0, 0, 2, 2, 2, 8, 2, 2,
546
+ 11, 11, 11, 15, 2, 2, 2, 2,
547
+ 2, 2, 33, 2, 2, 2, 2, 2,
548
+ 33, 2, 2, 2, 2, 44, 2, 2,
549
+ 11, 2, 59, 59, 0, 0, 77, 78,
550
+ 80, 2, 80, 82, 84, 85, 91, 92,
551
+ 92, 92, 92, 92, 92, 92, 92, 92,
552
+ 92, 85, 85, 84, 92, 85, 80, 99,
553
+ 100, 101, 2, 103, 110, 110, 112, 2,
554
+ 115, 115, 115, 115, 115, 115, 115, 115,
555
+ 115, 103, 112, 110, 78, 103, 103, 103,
556
+ 103, 103, 92, 103, 103, 103, 103, 103
557
+ };
558
+
559
+ static const int StdTok_start = 1;
560
+ static const int StdTok_error = 0;
561
+
562
+ static const int StdTok_en_frt_tokenizer = 37;
563
+ static const int StdTok_en_main = 1;
564
+
565
+ #line 29 "scanner.rl"
566
+
567
+ void frt_std_scan(const char *in,
568
+ char *out, size_t out_size,
569
+ const char **start,
570
+ const char **end,
571
+ int *token_size)
572
+ {
573
+ int cs, act, top;
574
+ int stack[32];
575
+ char *ts = 0, *te = 0;
576
+
577
+
578
+ #line 579 "scanner.c"
579
+ {
580
+ cs = StdTok_start;
581
+ top = 0;
582
+ ts = 0;
583
+ te = 0;
584
+ act = 0;
585
+ }
586
+ #line 41 "scanner.rl"
587
+
588
+ char *p = (char *)in, *pe = 0, *eof = pe;
589
+ int skip = 0;
590
+ int trunc = 0;
591
+ char strip_char = 0;
592
+
593
+ *end = 0;
594
+ *start = 0;
595
+ *token_size = 0;
596
+
597
+
598
+ #line 599 "scanner.c"
599
+ {
600
+ int _klen;
601
+ unsigned int _trans;
602
+ const char *_acts;
603
+ unsigned int _nacts;
604
+ const unsigned char *_keys;
605
+
606
+ if ( p == pe )
607
+ goto _test_eof;
608
+ if ( cs == 0 )
609
+ goto _out;
610
+ _resume:
611
+ _acts = _StdTok_actions + _StdTok_from_state_actions[cs];
612
+ _nacts = (unsigned int) *_acts++;
613
+ while ( _nacts-- > 0 ) {
614
+ switch ( *_acts++ ) {
615
+ case 4:
616
+ #line 1 "scanner.rl"
617
+ {ts = p;}
618
+ break;
619
+ #line 620 "scanner.c"
620
+ }
621
+ }
622
+
623
+ _keys = _StdTok_trans_keys + _StdTok_key_offsets[cs];
624
+ _trans = _StdTok_index_offsets[cs];
625
+
626
+ _klen = _StdTok_single_lengths[cs];
627
+ if ( _klen > 0 ) {
628
+ const unsigned char *_lower = _keys;
629
+ const unsigned char *_mid;
630
+ const unsigned char *_upper = _keys + _klen - 1;
631
+ while (1) {
632
+ if ( _upper < _lower )
633
+ break;
634
+
635
+ _mid = _lower + ((_upper-_lower) >> 1);
636
+ if ( (*p) < *_mid )
637
+ _upper = _mid - 1;
638
+ else if ( (*p) > *_mid )
639
+ _lower = _mid + 1;
640
+ else {
641
+ _trans += (_mid - _keys);
642
+ goto _match;
643
+ }
644
+ }
645
+ _keys += _klen;
646
+ _trans += _klen;
647
+ }
648
+
649
+ _klen = _StdTok_range_lengths[cs];
650
+ if ( _klen > 0 ) {
651
+ const unsigned char *_lower = _keys;
652
+ const unsigned char *_mid;
653
+ const unsigned char *_upper = _keys + (_klen<<1) - 2;
654
+ while (1) {
655
+ if ( _upper < _lower )
656
+ break;
657
+
658
+ _mid = _lower + (((_upper-_lower) >> 1) & ~1);
659
+ if ( (*p) < _mid[0] )
660
+ _upper = _mid - 2;
661
+ else if ( (*p) > _mid[1] )
662
+ _lower = _mid + 2;
663
+ else {
664
+ _trans += ((_mid - _keys)>>1);
665
+ goto _match;
666
+ }
667
+ }
668
+ _trans += _klen;
669
+ }
670
+
671
+ _match:
672
+ _trans = _StdTok_indicies[_trans];
673
+ _eof_trans:
674
+ cs = _StdTok_trans_targs[_trans];
675
+
676
+ if ( _StdTok_trans_actions[_trans] == 0 )
677
+ goto _again;
678
+
679
+ _acts = _StdTok_actions + _StdTok_trans_actions[_trans];
680
+ _nacts = (unsigned int) *_acts++;
681
+ while ( _nacts-- > 0 )
682
+ {
683
+ switch ( *_acts++ )
684
+ {
685
+ case 0:
686
+ #line 14 "scanner.rl"
687
+ { skip = p - ts; }
688
+ break;
689
+ case 1:
690
+ #line 26 "scanner.rl"
691
+ { trunc = 1; }
692
+ break;
693
+ case 2:
694
+ #line 25 "scanner.rl"
695
+ { p--; {stack[top++] = cs; cs = 37; goto _again;} }
696
+ break;
697
+ case 5:
698
+ #line 1 "scanner.rl"
699
+ {te = p+1;}
700
+ break;
701
+ case 6:
702
+ #line 12 "scanner.rl"
703
+ {act = 1;}
704
+ break;
705
+ case 7:
706
+ #line 14 "scanner.rl"
707
+ {act = 2;}
708
+ break;
709
+ case 8:
710
+ #line 16 "scanner.rl"
711
+ {act = 3;}
712
+ break;
713
+ case 9:
714
+ #line 22 "scanner.rl"
715
+ {act = 6;}
716
+ break;
717
+ case 10:
718
+ #line 28 "scanner.rl"
719
+ {act = 8;}
720
+ break;
721
+ case 11:
722
+ #line 29 "scanner.rl"
723
+ {act = 9;}
724
+ break;
725
+ case 12:
726
+ #line 30 "scanner.rl"
727
+ {act = 10;}
728
+ break;
729
+ case 13:
730
+ #line 33 "scanner.rl"
731
+ {act = 11;}
732
+ break;
733
+ case 14:
734
+ #line 36 "scanner.rl"
735
+ {act = 12;}
736
+ break;
737
+ case 15:
738
+ #line 39 "scanner.rl"
739
+ {act = 13;}
740
+ break;
741
+ case 16:
742
+ #line 42 "scanner.rl"
743
+ {act = 14;}
744
+ break;
745
+ case 17:
746
+ #line 45 "scanner.rl"
747
+ {act = 15;}
748
+ break;
749
+ case 18:
750
+ #line 48 "scanner.rl"
751
+ {act = 16;}
752
+ break;
753
+ case 19:
754
+ #line 49 "scanner.rl"
755
+ {act = 17;}
756
+ break;
757
+ case 20:
758
+ #line 53 "scanner.rl"
759
+ {act = 19;}
760
+ break;
761
+ case 21:
762
+ #line 25 "scanner.rl"
763
+ {te = p+1;{ RET; }}
764
+ break;
765
+ case 22:
766
+ #line 42 "scanner.rl"
767
+ {te = p+1;{ RET; }}
768
+ break;
769
+ case 23:
770
+ #line 52 "scanner.rl"
771
+ {te = p+1;{ return; }}
772
+ break;
773
+ case 24:
774
+ #line 53 "scanner.rl"
775
+ {te = p+1;}
776
+ break;
777
+ case 25:
778
+ #line 12 "scanner.rl"
779
+ {te = p;p--;{ RET; }}
780
+ break;
781
+ case 26:
782
+ #line 14 "scanner.rl"
783
+ {te = p;p--;{ RET; }}
784
+ break;
785
+ case 27:
786
+ #line 18 "scanner.rl"
787
+ {te = p;p--;{ RET; }}
788
+ break;
789
+ case 28:
790
+ #line 20 "scanner.rl"
791
+ {te = p;p--;{ RET; }}
792
+ break;
793
+ case 29:
794
+ #line 22 "scanner.rl"
795
+ {te = p;p--;{ RET; }}
796
+ break;
797
+ case 30:
798
+ #line 25 "scanner.rl"
799
+ {te = p;p--;{ RET; }}
800
+ break;
801
+ case 31:
802
+ #line 28 "scanner.rl"
803
+ {te = p;p--;{ RET; }}
804
+ break;
805
+ case 32:
806
+ #line 29 "scanner.rl"
807
+ {te = p;p--;{ trunc = 1; RET; }}
808
+ break;
809
+ case 33:
810
+ #line 36 "scanner.rl"
811
+ {te = p;p--;{ RET; }}
812
+ break;
813
+ case 34:
814
+ #line 39 "scanner.rl"
815
+ {te = p;p--;{ RET; }}
816
+ break;
817
+ case 35:
818
+ #line 42 "scanner.rl"
819
+ {te = p;p--;{ RET; }}
820
+ break;
821
+ case 36:
822
+ #line 45 "scanner.rl"
823
+ {te = p;p--;{ STRIP('.'); }}
824
+ break;
825
+ case 37:
826
+ #line 48 "scanner.rl"
827
+ {te = p;p--;{ RET; }}
828
+ break;
829
+ case 38:
830
+ #line 49 "scanner.rl"
831
+ {te = p;p--;{ RET; }}
832
+ break;
833
+ case 39:
834
+ #line 53 "scanner.rl"
835
+ {te = p;p--;}
836
+ break;
837
+ case 40:
838
+ #line 25 "scanner.rl"
839
+ {{p = ((te))-1;}{ RET; }}
840
+ break;
841
+ case 41:
842
+ #line 28 "scanner.rl"
843
+ {{p = ((te))-1;}{ RET; }}
844
+ break;
845
+ case 42:
846
+ #line 36 "scanner.rl"
847
+ {{p = ((te))-1;}{ RET; }}
848
+ break;
849
+ case 43:
850
+ #line 42 "scanner.rl"
851
+ {{p = ((te))-1;}{ RET; }}
852
+ break;
853
+ case 44:
854
+ #line 48 "scanner.rl"
855
+ {{p = ((te))-1;}{ RET; }}
856
+ break;
857
+ case 45:
858
+ #line 53 "scanner.rl"
859
+ {{p = ((te))-1;}}
860
+ break;
861
+ case 46:
862
+ #line 1 "scanner.rl"
863
+ { switch( act ) {
864
+ case 1:
865
+ {{p = ((te))-1;} RET; }
866
+ break;
867
+ case 2:
868
+ {{p = ((te))-1;} RET; }
869
+ break;
870
+ case 3:
871
+ {{p = ((te))-1;} RET; }
872
+ break;
873
+ case 6:
874
+ {{p = ((te))-1;} RET; }
875
+ break;
876
+ case 8:
877
+ {{p = ((te))-1;} RET; }
878
+ break;
879
+ case 9:
880
+ {{p = ((te))-1;} trunc = 1; RET; }
881
+ break;
882
+ case 10:
883
+ {{p = ((te))-1;} trunc = 2; RET; }
884
+ break;
885
+ case 11:
886
+ {{p = ((te))-1;} RET; }
887
+ break;
888
+ case 12:
889
+ {{p = ((te))-1;} RET; }
890
+ break;
891
+ case 13:
892
+ {{p = ((te))-1;} RET; }
893
+ break;
894
+ case 14:
895
+ {{p = ((te))-1;} RET; }
896
+ break;
897
+ case 15:
898
+ {{p = ((te))-1;} STRIP('.'); }
899
+ break;
900
+ case 16:
901
+ {{p = ((te))-1;} RET; }
902
+ break;
903
+ case 17:
904
+ {{p = ((te))-1;} RET; }
905
+ break;
906
+ case 19:
907
+ {{p = ((te))-1;}}
908
+ break;
909
+ }
910
+ }
911
+ break;
912
+ #line 913 "scanner.c"
913
+ }
914
+ }
915
+
916
+ _again:
917
+ _acts = _StdTok_actions + _StdTok_to_state_actions[cs];
918
+ _nacts = (unsigned int) *_acts++;
919
+ while ( _nacts-- > 0 ) {
920
+ switch ( *_acts++ ) {
921
+ case 3:
922
+ #line 1 "scanner.rl"
923
+ {ts = 0;}
924
+ break;
925
+ #line 926 "scanner.c"
926
+ }
927
+ }
928
+
929
+ if ( cs == 0 )
930
+ goto _out;
931
+ if ( ++p != pe )
932
+ goto _resume;
933
+ _test_eof: {}
934
+ if ( p == eof )
935
+ {
936
+ if ( _StdTok_eof_trans[cs] > 0 ) {
937
+ _trans = _StdTok_eof_trans[cs] - 1;
938
+ goto _eof_trans;
939
+ }
940
+ }
941
+
942
+ _out: {}
943
+ }
944
+ #line 52 "scanner.rl"
945
+
946
+ if ( cs == StdTok_error )
947
+ fprintf(stderr, "PARSE ERROR\n" );
948
+ else if ( ts ) fprintf(stderr, "STUFF LEFT: '%s'\n", ts);
949
+ return;
950
+
951
+ ret:
952
+ {
953
+ size_t __len = te - ts - skip - trunc;
954
+ if (__len > out_size)
955
+ __len = out_size;
956
+
957
+ *start = ts;
958
+ *end = te;
959
+
960
+ if (strip_char) {
961
+ char *__p = ts + skip;
962
+ char *__o = out;
963
+ for (; __p < (ts + skip + __len); ++__p) {
964
+ if (*__p != strip_char)
965
+ *__o++ = *__p;
966
+ }
967
+ *token_size = __o - out;
968
+ }
969
+ else {
970
+ memcpy(out, ts + skip, __len);
971
+ *token_size = __len;
972
+ }
973
+
974
+ out[*token_size] = 0;
975
+ }
976
+ }
@@ -0,0 +1,56 @@
1
+ #// scanner.in -*-C-*-
2
+
3
+ %%{
4
+ machine StdTok;
5
+ include URL "url.rl";
6
+ include Email "email.rl";
7
+ #// token: one letter followed by any number of letters or digits
8
+ token = frt_alpha frt_alnum*;
9
+ #// Scanner block. RET (scanner.rl) jumps to the host function's ret label, which copies the match into the output buffer.
10
+ frt_tokenizer := |*
11
+ #// question_mark: a run of one or more '?'
12
+ ('?')+ { RET; };
13
+ #// exclamation_mark: a run of one or more '!'
14
+ ('!')+ { RET; };
15
+ #// twitter_user: '@' followed by one or more alphanumerics
16
+ '@' (alnum)+ { RET; };
17
+ #// smile_face: ':' then an optional '-' then one or more ')'
18
+ ':' ('-')? (')')+ { RET; };
19
+ #// angry_face: ':' then an optional '-' then one or more '('
20
+ ':' ('-')? ('(')+ { RET; };
21
+ #// fword: alphanumerics with one or more '*' in the middle (e.g. sh*t)
22
+ alnum+ ('*')+ alnum+ { RET; };
23
+
24
+ #// Email
25
+ email { RET; };
26
+
27
+ #// Token, or token with possessive
28
+ token { RET; };
29
+ token [\'] { trunc = 1; RET; }; #// trunc = 1: the trailing apostrophe is matched but left out of the copied token
30
+ token [\'][sS] { trunc = 2; RET; }; #// trunc = 2: the trailing 's / 'S is matched but left out of the copied token
31
+
32
+ #// contractions: letters, an apostrophe, then more letters
33
+ frt_alpha+ [\'] frt_alpha+ { RET; };
34
+
35
+ #// Token with hyphens (or underscores) between alphanumeric runs
36
+ frt_alnum+ ([\-_] frt_alnum+)* { RET; };
37
+
38
+ #// Company name: a token, then '&' or '@', then zero or more tokens (e.g. AT&T)
39
+ token [\&\@] token* { RET; };
40
+
41
+ #// URL
42
+ url { RET; };
43
+
44
+ #// Acronym: letters separated by dots; STRIP('.') drops the dots from the copied token
45
+ (frt_alpha '.')+ frt_alpha { STRIP('.'); };
46
+
47
+ #// Int+float, with an optional leading sign
48
+ [\-\+]?frt_digit+ { RET; };
49
+ [\-\+]?frt_digit+ '.' frt_digit+ { RET; };
50
+
51
+ #// Ignore whitespace and other crap
52
+ 0 { return; }; #// NUL byte: leave the host function without producing a token
53
+ (any - frt_alnum) {}; #// any other non-alphanumeric character: empty action, scanning goes on
54
+
55
+ *|;
56
+ }%%
@@ -0,0 +1,83 @@
1
+ /* scanner.rl -*-C-*- */
2
+ #include <ctype.h>
3
+ #include <stdio.h>
4
+ #include <stdlib.h>
5
+ #include <string.h>
6
+ #include <unistd.h>
7
+
8
+ #define RET goto ret; /* finish the current token: jump to the ret label in frt_std_scan (the ';' is part of the macro) */
9
+ /* STRIP(c): like RET, but first sets strip_char so every occurrence of c is left out of the copied token. */
10
+ #define STRIP(c) do { \
11
+ strip_char = c; \
12
+ goto ret; \
13
+ } while(0)
14
+
15
+ %%{
16
+ machine StdTok;
17
+ alphtype unsigned char;
18
+ #// Character classes used by scanner.in, mapped to Ragel's built-in alpha / alnum / digit.
19
+ frt_alpha = alpha;
20
+ frt_alnum = alnum;
21
+ frt_digit = digit;
22
+ #// Pulls in the frt_tokenizer scanner definition.
23
+ include StdTok "scanner.in";
24
+ #// Entry point: on the first character, step back one (fhold) and call into frt_tokenizer so it sees that character too.
25
+ main := any @{ fhold; fcall frt_tokenizer; };
26
+ }%%
27
+ /* Ragel emits the machine's static data tables at this point. */
28
+ %% write data nofinal;
29
+
30
+ void frt_std_scan(const char *in, /* text to scan, starting at in[0] */
31
+ char *out, size_t out_size, /* buffer receiving the token copy, and the most chars copied into it */
32
+ const char **start, /* receives ts (start of the match), or 0 if no token is produced */
33
+ const char **end, /* receives te (end of the match), or 0 if no token is produced */
34
+ int *token_size) /* receives the number of chars written to out, terminator not counted */
35
+ { /* Runs the StdTok machine over `in` and copies the first token it yields into `out`. */
36
+ int cs, act, top; /* Ragel state: current state, last scanner action, call-stack top */
37
+ int stack[32]; /* state stack used by fcall */
38
+ char *ts = 0, *te = 0; /* Ragel scanner token start / end */
39
+ /* Ragel-generated initialisation of the state variables. */
40
+ %% write init;
41
+ /* pe and eof are 0, so no end pointer bounds the scan; the grammar's 0 (NUL) rule returns instead -- presumably `in` must be NUL-terminated, TODO confirm. */
42
+ char *p = (char *)in, *pe = 0, *eof = pe;
43
+ int skip = 0; /* leading chars of the match to leave out of the copy (set by the url rule) */
44
+ int trunc = 0; /* trailing chars of the match to leave out of the copy */
45
+ char strip_char = 0; /* when non-zero, this char is dropped from the copy (see STRIP) */
46
+ /* Outputs stay zero unless the ret block below runs. */
47
+ *end = 0;
48
+ *start = 0;
49
+ *token_size = 0;
50
+ /* Generated scanner loop; rule actions leave it via `goto ret` (RET / STRIP) or `return`. */
51
+ %% write exec;
52
+ /* Fall-through path: the generated loop ended without a rule jumping to ret. */
53
+ if ( cs == StdTok_error ) /* machine ended in its error state */
54
+ fprintf(stderr, "PARSE ERROR\n" );
55
+ else if ( ts ) fprintf(stderr, "STUFF LEFT: '%s'\n", ts); /* ts still set: presumably a token was started but not completed */
56
+ return;
57
+ /* Reached from RET / STRIP: copy the chars from ts + skip up to te - trunc into out. */
58
+ ret:
59
+ {
60
+ size_t __len = te - ts - skip - trunc; /* match length without the skipped prefix and truncated suffix */
61
+ if (__len > out_size) /* clamp the copy to out_size chars */
62
+ __len = out_size;
63
+
64
+ *start = ts; /* the reported span is the whole match, including skipped / truncated chars */
65
+ *end = te;
66
+
67
+ if (strip_char) { /* copy char by char, leaving out every strip_char */
68
+ char *__p = ts + skip;
69
+ char *__o = out;
70
+ for (; __p < (ts + skip + __len); ++__p) {
71
+ if (*__p != strip_char)
72
+ *__o++ = *__p;
73
+ }
74
+ *token_size = __o - out; /* may be smaller than __len when chars were dropped */
75
+ }
76
+ else { /* plain copy */
77
+ memcpy(out, ts + skip, __len);
78
+ *token_size = __len;
79
+ }
80
+ /* NOTE(review): when the token was clamped this writes out[out_size]; confirm callers supply at least out_size + 1 bytes. */
81
+ out[*token_size] = 0;
82
+ }
83
+ }
@@ -0,0 +1,27 @@
1
+ #// url.rl -*-C-*-
2
+
3
+ %%{
4
+ machine URL;
5
+ #// Character classes used to build URLs.
6
+ uword = [_] | alnum; #// underscore or alphanumeric
7
+ dword = '-' | uword; #// uword or hyphen
8
+ dalnum = '-' | alnum; #// alphanumeric or hyphen
9
+ proto = 'http'[s]? | 'ftp' | 'file'; #// http, https, ftp or file
10
+ urlc = alnum | [.,\/_\-\@\:]; #// characters accepted after "scheme://" in the generic form
11
+ #// url: one of three host forms, then an optional port and an optional trailing slash.
12
+ url =
13
+ (
14
+ proto [:][/]+ %{ skip = p - ts; } dword+ ([.] uword dword*)+ | #// known scheme, "://", dotted host; skip = chars consumed before the host, which frt_std_scan leaves out of the copy
15
+ alnum+ [:][/]+ urlc+ | #// generic "scheme://..." form
16
+ #// Bare host name: dot-terminated labels followed by a top-level domain.
17
+ (alnum (dalnum* alnum)? [.])+ #// Subdomains
18
+ ('com' |'edu' | 'biz' | 'gov' |
19
+ 'int' | 'info' | 'mil' | 'net' |
20
+ 'org' | alpha{2}) #// or any two letters
21
+ )
22
+
23
+ #// Port
24
+ ( [:] digit+ )?
25
+ #// Optional trailing '/': trunc = 1 makes frt_std_scan leave one trailing char out of the copied token.
26
+ ([/]? @{ trunc = 1; });
27
+ }%%
@@ -0,0 +1,7 @@
1
+ require 'ext/ferret_tokenizer' # the compiled extension (built by the Rakefile's EXT file task)
2
+ # Smoke test: print each token produced for a sample string (mention, censored word, repeated '!', emoticons).
3
+ t = FerretTokenizer.new '@feedbackmine sh*t, I LOVE this!!! so funny:-) :('
4
+ while (tok = t.next) # loop stops on the first nil/false; presumably next returns nil when the input is exhausted
5
+ p tok
6
+ end
7
+
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: feedbackmine-ferret_tokenizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - FeedbackMine
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-03-23 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.8.3
24
+ version:
25
+ description: A string tokenizer based on Ferret::Analysis::StandardAnalyzer.
26
+ email: feedbackmine@feedbackmine.com
27
+ executables: []
28
+
29
+ extensions:
30
+ - ext/extconf.rb
31
+ extra_rdoc_files:
32
+ - Manifest.txt
33
+ files:
34
+ - ext/ferret_tokenizer.c
35
+ - ext/scanner.rl
36
+ - ext/url.rl
37
+ - ext/extconf.rb
38
+ - ext/scanner.c
39
+ - ext/email.rl
40
+ - ext/scanner.in
41
+ - Rakefile
42
+ - Manifest.txt
43
+ - README.txt
44
+ - test
45
+ - test/test.rb
46
+ has_rdoc: true
47
+ homepage: http://www.tweetjobsearch.com
48
+ post_install_message:
49
+ rdoc_options:
50
+ - --main
51
+ - README.txt
52
+ require_paths:
53
+ - ext
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: "0"
59
+ version:
60
+ required_rubygems_version: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: "0"
65
+ version:
66
+ requirements: []
67
+
68
+ rubyforge_project: tokenizer
69
+ rubygems_version: 1.2.0
70
+ signing_key:
71
+ specification_version: 2
72
+ summary: tokenizer
73
+ test_files: []
74
+