sishen-ferret_tokenizer 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/Manifest.txt ADDED
@@ -0,0 +1,12 @@
1
+ ext/ferret_tokenizer.c
2
+ ext/scanner.rl
3
+ ext/url.rl
4
+ ext/extconf.rb
5
+ ext/scanner.c
6
+ ext/email.rl
7
+ ext/scanner.in
8
+ Rakefile
9
+ Manifest.txt
10
+ README.txt
11
+ test/test.rb
12
+
data/README.txt ADDED
@@ -0,0 +1,20 @@
1
+ == Overview
2
+ A string tokenizer based on Ferret::Analysis::StandardAnalyzer. I made some improvements for better text analysis.
3
+
4
+ == Install
5
+
6
+ sudo gem sources -a http://gems.github.com (you only have to do this once)
7
+ sudo gem install feedbackmine-ferret_tokenizer
8
+
9
+ == Usage
10
+
11
+ require 'ferret_tokenizer'
12
+ t = FerretTokenizer.new '@feedbackmine, I LOVE this!!! :-)'
13
+ while (tok = t.next)
14
+ p tok
15
+ end
16
+
17
+ == Follow us on twitter
18
+
19
+ http://twitter.com/feedbackmine
20
+
data/Rakefile ADDED
@@ -0,0 +1,25 @@
1
require 'rubygems'
require 'hoe'

# Path of the compiled extension; Hoe::DLEXT supplies the platform's
# shared-library suffix.
EXT = "ext/ferret_tokenizer.#{Hoe::DLEXT}"

# Ragel inputs that ext/scanner.c is generated from: scanner.rl includes
# scanner.in, which in turn includes url.rl and email.rl.
RAGEL_SOURCES = %w[ext/scanner.rl ext/scanner.in ext/url.rl ext/email.rl]

Hoe.new('ferret_tokenizer', '0.1.1') do |p|
  p.author = 'FeedbackMine'
  p.email = 'feedbackmine@feedbackmine.com'
  p.url = 'http://www.tweetjobsearch.com'
  p.summary = 'A string tokenizer based on Ferret::Analysis::StandardAnalyzer.'
  p.description = 'A string tokenizer based on Ferret::Analysis::StandardAnalyzer.'

  p.spec_extras[:extensions] = "ext/extconf.rb"
  p.clean_globs << EXT << "ext/*.o" << "ext/Makefile"
end

# The tests load the compiled extension, so build it first.
task :test => EXT

# Rebuild the extension whenever the C glue, the build script or any Ragel
# grammar file changes. The grammar files are prerequisites because the
# action below regenerates scanner.c from them.
file EXT => ["ext/extconf.rb", "ext/ferret_tokenizer.c"] + RAGEL_SOURCES do
  Dir.chdir "ext" do
    sh "ragel scanner.rl -o scanner.c"
    ruby "extconf.rb"
    sh "make"
  end
end
data/ext/email.rl ADDED
@@ -0,0 +1,21 @@
1
+ #// email.rl -*-C-*-
2
+ %%{
3
+ machine Email;
4
+
5
+ #// RFC 2822 - matching email addresses
6
+ NO_WS_CTL = ( 1..8 | 11 | 12 | 14..31 | 127 );
7
+ ASCII = 1..127;
8
+ atext = [a-zA-Z0-9!#$%&\'*+\-/=?^_`{|}~];
9
+ qtext = ( NO_WS_CTL | 33 | 35..91 | 93..126 );
10
+ dtext = ( NO_WS_CTL | 33..90 | 94..126 );
11
+ dot_atom = atext+ ('.' atext+)*;
12
+ text = ( 1..9 | 11 | 12 | 14..127 );
13
+ quoted_pair = '\\' text;
14
+ quoted_string = '"' ( qtext | quoted_pair )* '"';
15
+ domain_literal = '[' (dtext | quoted_pair)* ']';
16
+
17
+ local_part = dot_atom | quoted_string;
18
+ domain = dot_atom | domain_literal;
19
+
20
+ email = local_part '@' domain;
21
+ }%%
data/ext/extconf.rb ADDED
@@ -0,0 +1,4 @@
1
require 'mkmf'
# Extend the flags mkmf took from the Ruby build instead of replacing them,
# so platform-specific options chosen when Ruby was configured are kept.
# Flags from the CFLAGS environment variable and our own -Wall -O3 come last
# so they take precedence over earlier optimisation levels.
$CFLAGS = "#{$CFLAGS} #{ENV['CFLAGS']} -Wall -O3 "
create_makefile('ferret_tokenizer')
4
+
@@ -0,0 +1,67 @@
1
+ #include <ruby.h>
2
+
3
+ extern void frt_std_scan(const char *in,
4
+ char *out, size_t out_size,
5
+ const char **start,
6
+ const char **end,
7
+ int *token_size);
8
+
9
/*
 * Scanner state for one tokenizer instance.
 *
 * data is the read cursor: the position in the NUL-terminated input at which
 * the next scan starts. The struct does not own the text it points into;
 * free_tokenizer() releases only the struct itself.
 */
typedef struct _Tokenizer {
  char *data;
} Tokenizer;

/*
 * Allocate a zero-initialised Tokenizer.
 *
 * Returns NULL if the allocation fails. The caller owns the result and
 * releases it with free_tokenizer().
 */
Tokenizer *new_tokenizer(void)
{
  /* calloc rather than malloc so that data starts out as NULL instead of an
   * indeterminate pointer that could later be handed to the scanner. */
  return calloc(1, sizeof(Tokenizer));
}

/* Release a Tokenizer obtained from new_tokenizer(); passing NULL is a no-op. */
void free_tokenizer(Tokenizer *t)
{
  free(t);
}
22
+
23
+ static VALUE tokenizer_alloc(VALUE klass) {
24
+ Tokenizer *tokenizer;
25
+ VALUE obj;
26
+ // Vendor library creates the Jukebox
27
+ tokenizer = new_tokenizer();
28
+ // then we wrap it inside a Ruby CDPlayer object
29
+ obj = Data_Wrap_Struct(klass, 0, free_tokenizer, tokenizer);
30
+ return obj;
31
+ }
32
+
33
+ VALUE method_next(VALUE self) {
34
+ const char *start = NULL;
35
+ const char *end = NULL;
36
+ int len;
37
+
38
+ Tokenizer *t;
39
+ Data_Get_Struct(self, Tokenizer, t);
40
+
41
+ char buffer[1024];
42
+
43
+ frt_std_scan(t->data, buffer, sizeof(buffer) - 1,
44
+ &start, &end, &len);
45
+ if (len == 0)
46
+ return Qnil;
47
+
48
+ t->data = end;
49
+ return rb_str_new2(buffer);
50
+ }
51
+
52
+ VALUE method_initialize(VALUE self, VALUE str) {
53
+ char* s = RSTRING(str)->ptr;
54
+ Tokenizer *t;
55
+ Data_Get_Struct(self, Tokenizer, t);
56
+ t->data = s;
57
+ return self;
58
+ }
59
+
60
+ VALUE cTokenizer = Qnil;
61
+
62
+ void Init_ferret_tokenizer() {
63
+ cTokenizer = rb_define_class("FerretTokenizer", rb_cObject);
64
+ rb_define_alloc_func(cTokenizer, tokenizer_alloc);
65
+ rb_define_method(cTokenizer, "initialize", method_initialize, 1);
66
+ rb_define_method(cTokenizer, "next", method_next, 0);
67
+ }
data/ext/scanner.c ADDED
@@ -0,0 +1,895 @@
1
+ #line 1 "src/scanner.rl"
2
+ /* scanner.rl -*-C-*- */
3
+ #include <ctype.h>
4
+ #include <stdio.h>
5
+ #include <stdlib.h>
6
+ #include <string.h>
7
+ #include <unistd.h>
8
+ /* #include "global.h" */
9
+ /* #include "internal.h" */
10
+
11
+ #define RET goto ret;
12
+
13
+ #define STRIP(c) do { \
14
+ strip_char = c; \
15
+ goto ret; \
16
+ } while(0)
17
+
18
+ #line 28 "src/scanner.rl"
19
+
20
+
21
+
22
+ #line 23 "src/scanner.c"
23
+ static const char _StdTok_actions[] = {
24
+ 0, 1, 0, 1, 1, 1, 2, 1,
25
+ 3, 1, 4, 1, 5, 1, 17, 1,
26
+ 19, 1, 20, 1, 21, 1, 22, 1,
27
+ 23, 1, 24, 1, 25, 1, 26, 1,
28
+ 27, 1, 28, 1, 29, 1, 30, 1,
29
+ 31, 1, 32, 1, 33, 1, 34, 1,
30
+ 35, 1, 36, 1, 37, 2, 1, 18,
31
+ 2, 5, 6, 2, 5, 7, 2, 5,
32
+ 8, 2, 5, 9, 2, 5, 10, 2,
33
+ 5, 11, 2, 5, 12, 2, 5, 13,
34
+ 2, 5, 14, 2, 5, 15, 2, 5,
35
+ 16, 3, 5, 1, 12
36
+ };
37
+
38
+ static const short _StdTok_key_offsets[] = {
39
+ 0, 0, 0, 14, 28, 43, 57, 69,
40
+ 75, 85, 86, 92, 107, 128, 149, 170,
41
+ 197, 218, 220, 242, 264, 286, 308, 330,
42
+ 352, 374, 396, 418, 439, 466, 467, 474,
43
+ 501, 502, 515, 515, 537, 551, 565, 575,
44
+ 591, 607, 623, 646, 668, 671, 694, 717,
45
+ 740, 763, 787, 810, 833, 856, 878, 900,
46
+ 921, 929, 952, 977, 996, 1017, 1038, 1057,
47
+ 1078, 1100, 1122, 1144, 1166, 1188, 1210, 1232,
48
+ 1254, 1279, 1295, 1314, 1334, 1359, 1386, 1412,
49
+ 1438, 1463, 1476, 1502, 1528, 1554, 1580
50
+ };
51
+
52
+ static const unsigned char _StdTok_trans_keys[] = {
53
+ 33u, 46u, 61u, 64u, 35u, 39u, 42u, 43u,
54
+ 45u, 57u, 63u, 90u, 94u, 126u, 33u, 45u,
55
+ 61u, 63u, 35u, 39u, 42u, 43u, 47u, 57u,
56
+ 65u, 90u, 94u, 126u, 33u, 45u, 61u, 63u,
57
+ 91u, 35u, 39u, 42u, 43u, 47u, 57u, 65u,
58
+ 90u, 94u, 126u, 33u, 45u, 61u, 63u, 35u,
59
+ 39u, 42u, 43u, 47u, 57u, 65u, 90u, 94u,
60
+ 126u, 92u, 93u, 1u, 8u, 11u, 12u, 14u,
61
+ 31u, 33u, 90u, 94u, 127u, 1u, 9u, 11u,
62
+ 12u, 14u, 127u, 34u, 92u, 1u, 8u, 11u,
63
+ 12u, 14u, 31u, 33u, 127u, 64u, 1u, 9u,
64
+ 11u, 12u, 14u, 127u, 33u, 45u, 47u, 61u,
65
+ 63u, 35u, 39u, 42u, 43u, 48u, 57u, 65u,
66
+ 90u, 94u, 126u, 33u, 45u, 46u, 47u, 61u,
67
+ 63u, 64u, 35u, 39u, 42u, 43u, 48u, 57u,
68
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
69
+ 33u, 45u, 46u, 47u, 61u, 63u, 64u, 35u,
70
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
71
+ 96u, 97u, 122u, 123u, 126u, 33u, 45u, 46u,
72
+ 47u, 61u, 63u, 64u, 35u, 39u, 42u, 43u,
73
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
74
+ 123u, 126u, 33u, 45u, 47u, 61u, 63u, 98u,
75
+ 99u, 101u, 103u, 105u, 109u, 110u, 111u, 35u,
76
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
77
+ 96u, 97u, 122u, 123u, 126u, 33u, 45u, 46u,
78
+ 47u, 61u, 63u, 64u, 35u, 39u, 42u, 43u,
79
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
80
+ 123u, 126u, 48u, 57u, 33u, 45u, 46u, 47u,
81
+ 61u, 63u, 64u, 105u, 35u, 39u, 42u, 43u,
82
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
83
+ 123u, 126u, 33u, 45u, 46u, 47u, 61u, 63u,
84
+ 64u, 111u, 35u, 39u, 42u, 43u, 48u, 57u,
85
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
86
+ 33u, 45u, 46u, 47u, 61u, 63u, 64u, 100u,
87
+ 35u, 39u, 42u, 43u, 48u, 57u, 65u, 90u,
88
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 45u,
89
+ 46u, 47u, 61u, 63u, 64u, 111u, 35u, 39u,
90
+ 42u, 43u, 48u, 57u, 65u, 90u, 94u, 96u,
91
+ 97u, 122u, 123u, 126u, 33u, 45u, 46u, 47u,
92
+ 61u, 63u, 64u, 110u, 35u, 39u, 42u, 43u,
93
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
94
+ 123u, 126u, 33u, 45u, 46u, 47u, 61u, 63u,
95
+ 64u, 111u, 35u, 39u, 42u, 43u, 48u, 57u,
96
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
97
+ 33u, 45u, 46u, 47u, 61u, 63u, 64u, 105u,
98
+ 35u, 39u, 42u, 43u, 48u, 57u, 65u, 90u,
99
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 45u,
100
+ 46u, 47u, 61u, 63u, 64u, 101u, 35u, 39u,
101
+ 42u, 43u, 48u, 57u, 65u, 90u, 94u, 96u,
102
+ 97u, 122u, 123u, 126u, 33u, 45u, 46u, 47u,
103
+ 61u, 63u, 64u, 114u, 35u, 39u, 42u, 43u,
104
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
105
+ 123u, 126u, 33u, 46u, 61u, 63u, 64u, 35u,
106
+ 39u, 42u, 43u, 45u, 47u, 48u, 57u, 65u,
107
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
108
+ 45u, 47u, 61u, 63u, 98u, 99u, 101u, 103u,
109
+ 105u, 109u, 110u, 111u, 35u, 39u, 42u, 43u,
110
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
111
+ 123u, 126u, 47u, 95u, 44u, 58u, 64u, 90u,
112
+ 97u, 122u, 33u, 45u, 47u, 61u, 63u, 98u,
113
+ 99u, 101u, 103u, 105u, 109u, 110u, 111u, 35u,
114
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
115
+ 96u, 97u, 122u, 123u, 126u, 47u, 45u, 47u,
116
+ 58u, 64u, 95u, 44u, 46u, 48u, 57u, 65u,
117
+ 90u, 97u, 122u, 0u, 34u, 42u, 43u, 45u,
118
+ 47u, 61u, 63u, 102u, 104u, 33u, 39u, 48u,
119
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
120
+ 126u, 33u, 46u, 61u, 64u, 35u, 39u, 42u,
121
+ 43u, 45u, 57u, 63u, 90u, 94u, 126u, 33u,
122
+ 46u, 61u, 63u, 35u, 39u, 42u, 43u, 45u,
123
+ 57u, 65u, 90u, 94u, 126u, 34u, 92u, 1u,
124
+ 8u, 11u, 12u, 14u, 31u, 33u, 127u, 33u,
125
+ 46u, 61u, 64u, 35u, 39u, 42u, 43u, 45u,
126
+ 47u, 48u, 57u, 63u, 90u, 94u, 126u, 33u,
127
+ 46u, 61u, 64u, 35u, 39u, 42u, 43u, 45u,
128
+ 47u, 48u, 57u, 63u, 90u, 94u, 126u, 33u,
129
+ 46u, 61u, 64u, 35u, 39u, 42u, 43u, 45u,
130
+ 47u, 48u, 57u, 63u, 90u, 94u, 126u, 33u,
131
+ 45u, 46u, 47u, 58u, 61u, 63u, 64u, 95u,
132
+ 35u, 39u, 42u, 43u, 48u, 57u, 65u, 90u,
133
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 45u,
134
+ 46u, 47u, 58u, 61u, 63u, 64u, 35u, 39u,
135
+ 42u, 43u, 48u, 57u, 65u, 90u, 94u, 96u,
136
+ 97u, 122u, 123u, 126u, 47u, 48u, 57u, 33u,
137
+ 45u, 46u, 47u, 58u, 61u, 63u, 64u, 122u,
138
+ 35u, 39u, 42u, 43u, 48u, 57u, 65u, 90u,
139
+ 94u, 96u, 97u, 121u, 123u, 126u, 33u, 45u,
140
+ 46u, 47u, 58u, 61u, 63u, 64u, 109u, 35u,
141
+ 39u, 42u, 43u, 48u, 57u, 65u, 90u, 94u,
142
+ 96u, 97u, 122u, 123u, 126u, 33u, 45u, 46u,
143
+ 47u, 58u, 61u, 63u, 64u, 117u, 35u, 39u,
144
+ 42u, 43u, 48u, 57u, 65u, 90u, 94u, 96u,
145
+ 97u, 122u, 123u, 126u, 33u, 45u, 46u, 47u,
146
+ 58u, 61u, 63u, 64u, 118u, 35u, 39u, 42u,
147
+ 43u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
148
+ 122u, 123u, 126u, 33u, 45u, 46u, 47u, 58u,
149
+ 61u, 63u, 64u, 102u, 116u, 35u, 39u, 42u,
150
+ 43u, 48u, 57u, 65u, 90u, 94u, 96u, 97u,
151
+ 122u, 123u, 126u, 33u, 45u, 46u, 47u, 58u,
152
+ 61u, 63u, 64u, 108u, 35u, 39u, 42u, 43u,
153
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
154
+ 123u, 126u, 33u, 45u, 46u, 47u, 58u, 61u,
155
+ 63u, 64u, 116u, 35u, 39u, 42u, 43u, 48u,
156
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
157
+ 126u, 33u, 45u, 46u, 47u, 58u, 61u, 63u,
158
+ 64u, 103u, 35u, 39u, 42u, 43u, 48u, 57u,
159
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
160
+ 33u, 45u, 46u, 47u, 61u, 63u, 64u, 95u,
161
+ 35u, 39u, 42u, 43u, 48u, 57u, 65u, 90u,
162
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 45u,
163
+ 46u, 47u, 61u, 63u, 64u, 95u, 35u, 39u,
164
+ 42u, 43u, 48u, 57u, 65u, 90u, 94u, 96u,
165
+ 97u, 122u, 123u, 126u, 33u, 45u, 46u, 47u,
166
+ 61u, 63u, 64u, 35u, 39u, 42u, 43u, 48u,
167
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
168
+ 126u, 47u, 95u, 44u, 58u, 64u, 90u, 97u,
169
+ 122u, 33u, 45u, 46u, 47u, 58u, 61u, 63u,
170
+ 64u, 95u, 35u, 39u, 42u, 43u, 48u, 57u,
171
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
172
+ 33u, 38u, 39u, 45u, 46u, 47u, 58u, 61u,
173
+ 63u, 64u, 95u, 35u, 37u, 42u, 43u, 48u,
174
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
175
+ 126u, 33u, 46u, 61u, 63u, 64u, 35u, 39u,
176
+ 42u, 43u, 45u, 57u, 65u, 90u, 94u, 96u,
177
+ 97u, 122u, 123u, 126u, 33u, 46u, 61u, 63u,
178
+ 64u, 35u, 39u, 42u, 43u, 45u, 47u, 48u,
179
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
180
+ 126u, 33u, 46u, 61u, 63u, 64u, 83u, 115u,
181
+ 35u, 39u, 42u, 43u, 45u, 57u, 65u, 90u,
182
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 46u,
183
+ 61u, 63u, 64u, 35u, 39u, 42u, 43u, 45u,
184
+ 57u, 65u, 90u, 94u, 96u, 97u, 122u, 123u,
185
+ 126u, 33u, 45u, 46u, 47u, 61u, 63u, 64u,
186
+ 35u, 39u, 42u, 43u, 48u, 57u, 65u, 90u,
187
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 45u,
188
+ 46u, 47u, 61u, 63u, 64u, 105u, 35u, 39u,
189
+ 42u, 43u, 48u, 57u, 65u, 90u, 94u, 96u,
190
+ 97u, 122u, 123u, 126u, 33u, 45u, 46u, 47u,
191
+ 61u, 63u, 64u, 111u, 35u, 39u, 42u, 43u,
192
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
193
+ 123u, 126u, 33u, 45u, 46u, 47u, 61u, 63u,
194
+ 64u, 100u, 35u, 39u, 42u, 43u, 48u, 57u,
195
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
196
+ 33u, 45u, 46u, 47u, 61u, 63u, 64u, 111u,
197
+ 35u, 39u, 42u, 43u, 48u, 57u, 65u, 90u,
198
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 45u,
199
+ 46u, 47u, 61u, 63u, 64u, 110u, 35u, 39u,
200
+ 42u, 43u, 48u, 57u, 65u, 90u, 94u, 96u,
201
+ 97u, 122u, 123u, 126u, 33u, 45u, 46u, 47u,
202
+ 61u, 63u, 64u, 105u, 35u, 39u, 42u, 43u,
203
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
204
+ 123u, 126u, 33u, 45u, 46u, 47u, 61u, 63u,
205
+ 64u, 101u, 35u, 39u, 42u, 43u, 48u, 57u,
206
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
207
+ 33u, 45u, 46u, 47u, 61u, 63u, 64u, 114u,
208
+ 35u, 39u, 42u, 43u, 48u, 57u, 65u, 90u,
209
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 38u,
210
+ 39u, 45u, 46u, 47u, 58u, 61u, 63u, 64u,
211
+ 95u, 35u, 37u, 42u, 43u, 48u, 57u, 65u,
212
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
213
+ 46u, 61u, 64u, 83u, 115u, 35u, 39u, 42u,
214
+ 43u, 45u, 57u, 63u, 90u, 94u, 126u, 33u,
215
+ 45u, 61u, 63u, 91u, 35u, 39u, 42u, 43u,
216
+ 47u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
217
+ 123u, 126u, 33u, 46u, 61u, 63u, 35u, 39u,
218
+ 42u, 43u, 45u, 47u, 48u, 57u, 65u, 90u,
219
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 38u,
220
+ 39u, 45u, 46u, 47u, 58u, 61u, 63u, 64u,
221
+ 95u, 35u, 37u, 42u, 43u, 48u, 57u, 65u,
222
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 33u,
223
+ 38u, 39u, 45u, 46u, 47u, 58u, 61u, 63u,
224
+ 64u, 95u, 105u, 116u, 35u, 37u, 42u, 43u,
225
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
226
+ 123u, 126u, 33u, 38u, 39u, 45u, 46u, 47u,
227
+ 58u, 61u, 63u, 64u, 95u, 108u, 35u, 37u,
228
+ 42u, 43u, 48u, 57u, 65u, 90u, 94u, 96u,
229
+ 97u, 122u, 123u, 126u, 33u, 38u, 39u, 45u,
230
+ 46u, 47u, 58u, 61u, 63u, 64u, 95u, 101u,
231
+ 35u, 37u, 42u, 43u, 48u, 57u, 65u, 90u,
232
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 38u,
233
+ 39u, 45u, 46u, 47u, 58u, 61u, 63u, 64u,
234
+ 95u, 35u, 37u, 42u, 43u, 48u, 57u, 65u,
235
+ 90u, 94u, 96u, 97u, 122u, 123u, 126u, 45u,
236
+ 47u, 58u, 64u, 95u, 44u, 46u, 48u, 57u,
237
+ 65u, 90u, 97u, 122u, 33u, 38u, 39u, 45u,
238
+ 46u, 47u, 58u, 61u, 63u, 64u, 95u, 112u,
239
+ 35u, 37u, 42u, 43u, 48u, 57u, 65u, 90u,
240
+ 94u, 96u, 97u, 122u, 123u, 126u, 33u, 38u,
241
+ 39u, 45u, 46u, 47u, 58u, 61u, 63u, 64u,
242
+ 95u, 116u, 35u, 37u, 42u, 43u, 48u, 57u,
243
+ 65u, 90u, 94u, 96u, 97u, 122u, 123u, 126u,
244
+ 33u, 38u, 39u, 45u, 46u, 47u, 58u, 61u,
245
+ 63u, 64u, 95u, 116u, 35u, 37u, 42u, 43u,
246
+ 48u, 57u, 65u, 90u, 94u, 96u, 97u, 122u,
247
+ 123u, 126u, 33u, 38u, 39u, 45u, 46u, 47u,
248
+ 58u, 61u, 63u, 64u, 95u, 112u, 35u, 37u,
249
+ 42u, 43u, 48u, 57u, 65u, 90u, 94u, 96u,
250
+ 97u, 122u, 123u, 126u, 33u, 38u, 39u, 45u,
251
+ 46u, 47u, 58u, 61u, 63u, 64u, 95u, 115u,
252
+ 35u, 37u, 42u, 43u, 48u, 57u, 65u, 90u,
253
+ 94u, 96u, 97u, 122u, 123u, 126u, 0
254
+ };
255
+
256
+ static const char _StdTok_single_lengths[] = {
257
+ 0, 0, 4, 4, 5, 4, 2, 0,
258
+ 2, 1, 0, 5, 7, 7, 7, 13,
259
+ 7, 0, 8, 8, 8, 8, 8, 8,
260
+ 8, 8, 8, 5, 13, 1, 1, 13,
261
+ 1, 5, 0, 10, 4, 4, 2, 4,
262
+ 4, 4, 9, 8, 1, 9, 9, 9,
263
+ 9, 10, 9, 9, 9, 8, 8, 7,
264
+ 2, 9, 11, 5, 5, 7, 5, 7,
265
+ 8, 8, 8, 8, 8, 8, 8, 8,
266
+ 11, 6, 5, 4, 11, 13, 12, 12,
267
+ 11, 5, 12, 12, 12, 12, 12
268
+ };
269
+
270
+ static const char _StdTok_range_lengths[] = {
271
+ 0, 0, 5, 5, 5, 5, 5, 3,
272
+ 4, 0, 3, 5, 7, 7, 7, 7,
273
+ 7, 1, 7, 7, 7, 7, 7, 7,
274
+ 7, 7, 7, 8, 7, 0, 3, 7,
275
+ 0, 4, 0, 6, 5, 5, 4, 6,
276
+ 6, 6, 7, 7, 1, 7, 7, 7,
277
+ 7, 7, 7, 7, 7, 7, 7, 7,
278
+ 3, 7, 7, 7, 8, 7, 7, 7,
279
+ 7, 7, 7, 7, 7, 7, 7, 7,
280
+ 7, 5, 7, 8, 7, 7, 7, 7,
281
+ 7, 4, 7, 7, 7, 7, 7
282
+ };
283
+
284
+ static const short _StdTok_index_offsets[] = {
285
+ 0, 0, 1, 11, 21, 32, 42, 50,
286
+ 54, 61, 63, 67, 78, 93, 108, 123,
287
+ 144, 159, 161, 177, 193, 209, 225, 241,
288
+ 257, 273, 289, 305, 319, 340, 342, 347,
289
+ 368, 370, 380, 381, 398, 408, 418, 425,
290
+ 436, 447, 458, 475, 491, 494, 511, 528,
291
+ 545, 562, 580, 597, 614, 631, 647, 663,
292
+ 678, 684, 701, 720, 733, 747, 762, 775,
293
+ 790, 806, 822, 838, 854, 870, 886, 902,
294
+ 918, 937, 949, 962, 975, 994, 1015, 1035,
295
+ 1055, 1074, 1084, 1104, 1124, 1144, 1164
296
+ };
297
+
298
+ static const char _StdTok_indicies[] = {
299
+ 0, 2, 3, 2, 4, 2, 2, 2,
300
+ 2, 2, 1, 2, 2, 2, 2, 2,
301
+ 2, 2, 2, 2, 1, 5, 5, 5,
302
+ 5, 6, 5, 5, 5, 5, 5, 1,
303
+ 5, 5, 5, 5, 5, 5, 5, 5,
304
+ 5, 7, 8, 9, 6, 6, 6, 6,
305
+ 6, 1, 6, 6, 6, 1, 12, 13,
306
+ 11, 11, 11, 11, 10, 4, 10, 11,
307
+ 11, 11, 10, 2, 2, 2, 2, 2,
308
+ 2, 2, 15, 2, 2, 14, 2, 16,
309
+ 3, 2, 2, 2, 4, 2, 2, 17,
310
+ 17, 2, 17, 2, 1, 2, 16, 3,
311
+ 2, 2, 2, 4, 2, 2, 18, 18,
312
+ 2, 18, 2, 1, 2, 16, 19, 2,
313
+ 2, 2, 4, 2, 2, 18, 18, 2,
314
+ 18, 2, 1, 2, 2, 2, 2, 2,
315
+ 21, 22, 23, 24, 25, 26, 27, 28,
316
+ 2, 2, 18, 20, 2, 20, 2, 1,
317
+ 2, 16, 19, 2, 2, 2, 4, 2,
318
+ 2, 18, 29, 2, 29, 2, 1, 31,
319
+ 30, 2, 16, 19, 2, 2, 2, 4,
320
+ 32, 2, 2, 18, 29, 2, 29, 2,
321
+ 1, 2, 16, 19, 2, 2, 2, 4,
322
+ 33, 2, 2, 18, 29, 2, 29, 2,
323
+ 1, 2, 16, 19, 2, 2, 2, 4,
324
+ 34, 2, 2, 18, 29, 2, 29, 2,
325
+ 1, 2, 16, 19, 2, 2, 2, 4,
326
+ 35, 2, 2, 18, 29, 2, 29, 2,
327
+ 1, 2, 16, 19, 2, 2, 2, 4,
328
+ 36, 2, 2, 18, 29, 2, 29, 2,
329
+ 1, 2, 16, 19, 2, 2, 2, 4,
330
+ 29, 2, 2, 18, 18, 2, 18, 2,
331
+ 30, 2, 16, 19, 2, 2, 2, 4,
332
+ 37, 2, 2, 18, 29, 2, 29, 2,
333
+ 1, 2, 16, 19, 2, 2, 2, 4,
334
+ 38, 2, 2, 18, 29, 2, 29, 2,
335
+ 1, 2, 16, 19, 2, 2, 2, 4,
336
+ 39, 2, 2, 18, 29, 2, 29, 2,
337
+ 1, 2, 3, 2, 2, 4, 2, 2,
338
+ 2, 40, 40, 2, 40, 2, 1, 2,
339
+ 2, 2, 2, 2, 21, 22, 23, 24,
340
+ 25, 26, 27, 28, 2, 2, 42, 20,
341
+ 2, 20, 2, 41, 43, 1, 44, 44,
342
+ 44, 44, 1, 2, 2, 2, 2, 2,
343
+ 46, 47, 48, 49, 50, 51, 52, 53,
344
+ 2, 2, 18, 45, 2, 45, 2, 1,
345
+ 55, 54, 56, 57, 44, 44, 56, 44,
346
+ 56, 56, 56, 54, 58, 59, 62, 61,
347
+ 63, 63, 61, 61, 61, 66, 67, 61,
348
+ 64, 65, 61, 65, 61, 60, 2, 3,
349
+ 2, 4, 2, 2, 2, 2, 2, 1,
350
+ 5, 69, 5, 5, 5, 5, 5, 5,
351
+ 5, 68, 12, 13, 11, 11, 11, 11,
352
+ 70, 2, 3, 2, 4, 2, 2, 2,
353
+ 71, 2, 2, 70, 2, 73, 2, 4,
354
+ 2, 2, 2, 71, 2, 2, 72, 2,
355
+ 3, 2, 4, 2, 2, 2, 15, 2,
356
+ 2, 74, 2, 76, 77, 2, 78, 2,
357
+ 2, 4, 80, 2, 2, 64, 79, 2,
358
+ 79, 2, 75, 2, 16, 19, 82, 83,
359
+ 2, 2, 4, 2, 2, 18, 18, 2,
360
+ 18, 2, 81, 84, 31, 81, 2, 16,
361
+ 19, 82, 83, 2, 2, 4, 29, 2,
362
+ 2, 18, 18, 2, 18, 2, 81, 2,
363
+ 16, 19, 82, 83, 2, 2, 4, 29,
364
+ 2, 2, 18, 18, 2, 18, 2, 81,
365
+ 2, 16, 19, 82, 83, 2, 2, 4,
366
+ 29, 2, 2, 18, 18, 2, 18, 2,
367
+ 81, 2, 16, 19, 82, 83, 2, 2,
368
+ 4, 29, 2, 2, 18, 18, 2, 18,
369
+ 2, 81, 2, 16, 19, 82, 83, 2,
370
+ 2, 4, 85, 29, 2, 2, 18, 18,
371
+ 2, 18, 2, 81, 2, 16, 19, 82,
372
+ 83, 2, 2, 4, 29, 2, 2, 18,
373
+ 18, 2, 18, 2, 81, 2, 16, 19,
374
+ 82, 83, 2, 2, 4, 29, 2, 2,
375
+ 18, 18, 2, 18, 2, 81, 2, 16,
376
+ 19, 82, 83, 2, 2, 4, 29, 2,
377
+ 2, 18, 18, 2, 18, 2, 81, 2,
378
+ 76, 19, 2, 2, 2, 4, 80, 2,
379
+ 2, 17, 17, 2, 17, 2, 75, 2,
380
+ 80, 3, 2, 2, 2, 4, 80, 2,
381
+ 2, 40, 40, 2, 40, 2, 75, 2,
382
+ 16, 19, 2, 2, 2, 4, 2, 2,
383
+ 42, 18, 2, 18, 2, 74, 86, 44,
384
+ 44, 44, 44, 81, 2, 76, 19, 2,
385
+ 78, 2, 2, 4, 80, 2, 2, 79,
386
+ 79, 2, 79, 2, 75, 2, 88, 89,
387
+ 76, 90, 2, 78, 2, 2, 92, 80,
388
+ 2, 2, 91, 93, 2, 93, 2, 87,
389
+ 2, 3, 2, 2, 4, 2, 2, 2,
390
+ 95, 2, 95, 2, 94, 2, 3, 2,
391
+ 2, 4, 2, 2, 2, 95, 95, 2,
392
+ 95, 2, 94, 2, 3, 2, 2, 4,
393
+ 98, 98, 2, 2, 2, 97, 2, 97,
394
+ 2, 96, 2, 3, 2, 2, 4, 2,
395
+ 2, 2, 97, 2, 97, 2, 1, 2,
396
+ 16, 90, 2, 2, 2, 4, 2, 2,
397
+ 18, 29, 2, 29, 2, 99, 2, 16,
398
+ 90, 2, 2, 2, 4, 32, 2, 2,
399
+ 18, 29, 2, 29, 2, 99, 2, 16,
400
+ 90, 2, 2, 2, 4, 33, 2, 2,
401
+ 18, 29, 2, 29, 2, 99, 2, 16,
402
+ 90, 2, 2, 2, 4, 34, 2, 2,
403
+ 18, 29, 2, 29, 2, 99, 2, 16,
404
+ 90, 2, 2, 2, 4, 35, 2, 2,
405
+ 18, 29, 2, 29, 2, 99, 2, 16,
406
+ 90, 2, 2, 2, 4, 36, 2, 2,
407
+ 18, 29, 2, 29, 2, 99, 2, 16,
408
+ 90, 2, 2, 2, 4, 37, 2, 2,
409
+ 18, 29, 2, 29, 2, 99, 2, 16,
410
+ 90, 2, 2, 2, 4, 38, 2, 2,
411
+ 18, 29, 2, 29, 2, 99, 2, 16,
412
+ 90, 2, 2, 2, 4, 39, 2, 2,
413
+ 18, 29, 2, 29, 2, 99, 2, 88,
414
+ 100, 76, 19, 2, 78, 2, 2, 92,
415
+ 80, 2, 2, 91, 91, 2, 91, 2,
416
+ 87, 2, 3, 2, 4, 101, 101, 2,
417
+ 2, 2, 2, 2, 96, 5, 5, 5,
418
+ 5, 6, 5, 5, 5, 102, 5, 102,
419
+ 5, 94, 5, 69, 5, 5, 5, 5,
420
+ 5, 102, 102, 5, 102, 5, 68, 2,
421
+ 88, 89, 76, 19, 2, 78, 2, 2,
422
+ 92, 80, 2, 2, 91, 93, 2, 93,
423
+ 2, 87, 2, 88, 89, 76, 90, 2,
424
+ 78, 2, 2, 92, 80, 103, 104, 2,
425
+ 2, 91, 93, 2, 93, 2, 87, 2,
426
+ 88, 89, 76, 19, 2, 78, 2, 2,
427
+ 92, 80, 105, 2, 2, 91, 93, 2,
428
+ 93, 2, 87, 2, 88, 89, 76, 19,
429
+ 2, 78, 2, 2, 92, 80, 106, 2,
430
+ 2, 91, 93, 2, 93, 2, 87, 2,
431
+ 88, 89, 76, 19, 2, 107, 2, 2,
432
+ 92, 80, 2, 2, 91, 93, 2, 93,
433
+ 2, 87, 56, 108, 44, 44, 56, 44,
434
+ 56, 56, 56, 81, 2, 88, 89, 76,
435
+ 19, 2, 78, 2, 2, 92, 80, 106,
436
+ 2, 2, 91, 93, 2, 93, 2, 87,
437
+ 2, 88, 89, 76, 90, 2, 78, 2,
438
+ 2, 92, 80, 109, 2, 2, 91, 93,
439
+ 2, 93, 2, 87, 2, 88, 89, 76,
440
+ 19, 2, 78, 2, 2, 92, 80, 110,
441
+ 2, 2, 91, 93, 2, 93, 2, 87,
442
+ 2, 88, 89, 76, 19, 2, 78, 2,
443
+ 2, 92, 80, 111, 2, 2, 91, 93,
444
+ 2, 93, 2, 87, 2, 88, 89, 76,
445
+ 19, 2, 107, 2, 2, 92, 80, 106,
446
+ 2, 2, 91, 93, 2, 93, 2, 87,
447
+ 0
448
+ };
449
+
450
+ static const char _StdTok_trans_targs[] = {
451
+ 34, 35, 2, 3, 4, 37, 6, 35,
452
+ 7, 35, 35, 8, 9, 10, 35, 41,
453
+ 13, 53, 14, 15, 16, 18, 19, 20,
454
+ 21, 22, 24, 25, 26, 43, 35, 44,
455
+ 45, 46, 47, 48, 49, 50, 51, 52,
456
+ 54, 35, 55, 30, 56, 63, 64, 65,
457
+ 66, 67, 68, 69, 70, 71, 35, 33,
458
+ 56, 81, 0, 35, 35, 36, 38, 39,
459
+ 42, 58, 77, 83, 35, 5, 35, 40,
460
+ 35, 11, 35, 35, 12, 28, 29, 57,
461
+ 27, 35, 36, 17, 35, 23, 56, 35,
462
+ 59, 61, 31, 72, 74, 76, 35, 60,
463
+ 35, 62, 62, 35, 73, 36, 75, 78,
464
+ 82, 79, 80, 32, 81, 84, 85, 86
465
+ };
466
+
467
+ static const char _StdTok_trans_actions[] = {
468
+ 5, 51, 0, 0, 0, 11, 0, 39,
469
+ 0, 13, 49, 0, 0, 0, 47, 83,
470
+ 0, 68, 0, 0, 0, 0, 0, 0,
471
+ 0, 0, 0, 0, 0, 74, 45, 0,
472
+ 74, 74, 74, 74, 74, 74, 74, 74,
473
+ 68, 43, 83, 0, 0, 77, 77, 77,
474
+ 77, 77, 77, 77, 77, 77, 41, 0,
475
+ 1, 0, 0, 15, 17, 86, 86, 86,
476
+ 68, 56, 56, 56, 19, 0, 37, 80,
477
+ 33, 0, 35, 25, 0, 0, 0, 68,
478
+ 0, 29, 89, 0, 53, 0, 3, 21,
479
+ 71, 59, 0, 56, 71, 56, 27, 71,
480
+ 23, 65, 62, 31, 59, 62, 11, 56,
481
+ 56, 56, 56, 0, 3, 56, 56, 56
482
+ };
483
+
484
+ static const char _StdTok_to_state_actions[] = {
485
+ 0, 7, 0, 0, 0, 0, 0, 0,
486
+ 0, 0, 0, 0, 0, 0, 0, 0,
487
+ 0, 0, 0, 0, 0, 0, 0, 0,
488
+ 0, 0, 0, 0, 0, 0, 0, 0,
489
+ 0, 0, 7, 7, 0, 0, 0, 0,
490
+ 0, 0, 0, 0, 0, 0, 0, 0,
491
+ 0, 0, 0, 0, 0, 0, 0, 0,
492
+ 0, 0, 0, 0, 0, 0, 0, 0,
493
+ 0, 0, 0, 0, 0, 0, 0, 0,
494
+ 0, 0, 0, 0, 0, 0, 0, 0,
495
+ 0, 0, 0, 0, 0, 0, 0
496
+ };
497
+
498
+ static const char _StdTok_from_state_actions[] = {
499
+ 0, 0, 0, 0, 0, 0, 0, 0,
500
+ 0, 0, 0, 0, 0, 0, 0, 0,
501
+ 0, 0, 0, 0, 0, 0, 0, 0,
502
+ 0, 0, 0, 0, 0, 0, 0, 0,
503
+ 0, 0, 0, 9, 0, 0, 0, 0,
504
+ 0, 0, 0, 0, 0, 0, 0, 0,
505
+ 0, 0, 0, 0, 0, 0, 0, 0,
506
+ 0, 0, 0, 0, 0, 0, 0, 0,
507
+ 0, 0, 0, 0, 0, 0, 0, 0,
508
+ 0, 0, 0, 0, 0, 0, 0, 0,
509
+ 0, 0, 0, 0, 0, 0, 0
510
+ };
511
+
512
+ static const short _StdTok_eof_trans[] = {
513
+ 0, 0, 2, 2, 2, 8, 2, 2,
514
+ 11, 11, 11, 15, 2, 2, 2, 2,
515
+ 2, 31, 2, 2, 2, 2, 2, 31,
516
+ 2, 2, 2, 2, 42, 2, 2, 2,
517
+ 55, 55, 0, 0, 2, 69, 71, 71,
518
+ 73, 75, 76, 82, 82, 82, 82, 82,
519
+ 82, 82, 82, 82, 82, 76, 76, 75,
520
+ 82, 76, 88, 95, 95, 97, 2, 100,
521
+ 100, 100, 100, 100, 100, 100, 100, 100,
522
+ 88, 97, 95, 69, 88, 88, 88, 88,
523
+ 88, 82, 88, 88, 88, 88, 88
524
+ };
525
+
526
+ static const int StdTok_start = 1;
527
+ static const int StdTok_error = 0;
528
+
529
+ static const int StdTok_en_frt_tokenizer = 35;
530
+ static const int StdTok_en_main = 1;
531
+
532
+ #line 31 "src/scanner.rl"
533
+
534
+ void frt_std_scan(const char *in,
535
+ char *out, size_t out_size,
536
+ const char **start,
537
+ const char **end,
538
+ int *token_size)
539
+ {
540
+ int cs, act, top;
541
+ int stack[32];
542
+ char *ts = 0, *te = 0;
543
+
544
+
545
+ #line 546 "src/scanner.c"
546
+ {
547
+ cs = StdTok_start;
548
+ top = 0;
549
+ ts = 0;
550
+ te = 0;
551
+ act = 0;
552
+ }
553
+ #line 43 "src/scanner.rl"
554
+
555
+ char *p = (char *)in, *pe = 0, *eof = pe;
556
+ int skip = 0;
557
+ int trunc = 0;
558
+ char strip_char = 0;
559
+
560
+ *end = 0;
561
+ *start = 0;
562
+ *token_size = 0;
563
+
564
+
565
+ #line 566 "src/scanner.c"
566
+ {
567
+ int _klen;
568
+ unsigned int _trans;
569
+ const char *_acts;
570
+ unsigned int _nacts;
571
+ const unsigned char *_keys;
572
+
573
+ if ( p == pe )
574
+ goto _test_eof;
575
+ if ( cs == 0 )
576
+ goto _out;
577
+ _resume:
578
+ _acts = _StdTok_actions + _StdTok_from_state_actions[cs];
579
+ _nacts = (unsigned int) *_acts++;
580
+ while ( _nacts-- > 0 ) {
581
+ switch ( *_acts++ ) {
582
+ case 4:
583
+ #line 1 "src/scanner.rl"
584
+ {ts = p;}
585
+ break;
586
+ #line 587 "src/scanner.c"
587
+ }
588
+ }
589
+
590
+ _keys = _StdTok_trans_keys + _StdTok_key_offsets[cs];
591
+ _trans = _StdTok_index_offsets[cs];
592
+
593
+ _klen = _StdTok_single_lengths[cs];
594
+ if ( _klen > 0 ) {
595
+ const unsigned char *_lower = _keys;
596
+ const unsigned char *_mid;
597
+ const unsigned char *_upper = _keys + _klen - 1;
598
+ while (1) {
599
+ if ( _upper < _lower )
600
+ break;
601
+
602
+ _mid = _lower + ((_upper-_lower) >> 1);
603
+ if ( (*p) < *_mid )
604
+ _upper = _mid - 1;
605
+ else if ( (*p) > *_mid )
606
+ _lower = _mid + 1;
607
+ else {
608
+ _trans += (_mid - _keys);
609
+ goto _match;
610
+ }
611
+ }
612
+ _keys += _klen;
613
+ _trans += _klen;
614
+ }
615
+
616
+ _klen = _StdTok_range_lengths[cs];
617
+ if ( _klen > 0 ) {
618
+ const unsigned char *_lower = _keys;
619
+ const unsigned char *_mid;
620
+ const unsigned char *_upper = _keys + (_klen<<1) - 2;
621
+ while (1) {
622
+ if ( _upper < _lower )
623
+ break;
624
+
625
+ _mid = _lower + (((_upper-_lower) >> 1) & ~1);
626
+ if ( (*p) < _mid[0] )
627
+ _upper = _mid - 2;
628
+ else if ( (*p) > _mid[1] )
629
+ _lower = _mid + 2;
630
+ else {
631
+ _trans += ((_mid - _keys)>>1);
632
+ goto _match;
633
+ }
634
+ }
635
+ _trans += _klen;
636
+ }
637
+
638
+ _match:
639
+ _trans = _StdTok_indicies[_trans];
640
+ _eof_trans:
641
+ cs = _StdTok_trans_targs[_trans];
642
+
643
+ if ( _StdTok_trans_actions[_trans] == 0 )
644
+ goto _again;
645
+
646
+ _acts = _StdTok_actions + _StdTok_trans_actions[_trans];
647
+ _nacts = (unsigned int) *_acts++;
648
+ while ( _nacts-- > 0 )
649
+ {
650
+ switch ( *_acts++ )
651
+ {
652
+ case 0:
653
+ #line 14 "src/scanner.rl"
654
+ { skip = p - ts; }
655
+ break;
656
+ case 1:
657
+ #line 26 "src/scanner.rl"
658
+ { trunc = 1; }
659
+ break;
660
+ case 2:
661
+ #line 27 "src/scanner.rl"
662
+ { p--; {stack[top++] = cs; cs = 35; goto _again;} }
663
+ break;
664
+ case 5:
665
+ #line 1 "src/scanner.rl"
666
+ {te = p+1;}
667
+ break;
668
+ case 6:
669
+ #line 15 "src/scanner.rl"
670
+ {act = 2;}
671
+ break;
672
+ case 7:
673
+ #line 16 "src/scanner.rl"
674
+ {act = 3;}
675
+ break;
676
+ case 8:
677
+ #line 17 "src/scanner.rl"
678
+ {act = 4;}
679
+ break;
680
+ case 9:
681
+ #line 20 "src/scanner.rl"
682
+ {act = 5;}
683
+ break;
684
+ case 10:
685
+ #line 23 "src/scanner.rl"
686
+ {act = 6;}
687
+ break;
688
+ case 11:
689
+ #line 26 "src/scanner.rl"
690
+ {act = 7;}
691
+ break;
692
+ case 12:
693
+ #line 29 "src/scanner.rl"
694
+ {act = 8;}
695
+ break;
696
+ case 13:
697
+ #line 32 "src/scanner.rl"
698
+ {act = 9;}
699
+ break;
700
+ case 14:
701
+ #line 35 "src/scanner.rl"
702
+ {act = 10;}
703
+ break;
704
+ case 15:
705
+ #line 36 "src/scanner.rl"
706
+ {act = 11;}
707
+ break;
708
+ case 16:
709
+ #line 40 "src/scanner.rl"
710
+ {act = 13;}
711
+ break;
712
+ case 17:
713
+ #line 12 "src/scanner.rl"
714
+ {te = p+1;{ RET; }}
715
+ break;
716
+ case 18:
717
+ #line 29 "src/scanner.rl"
718
+ {te = p+1;{ RET; }}
719
+ break;
720
+ case 19:
721
+ #line 39 "src/scanner.rl"
722
+ {te = p+1;{ return; }}
723
+ break;
724
+ case 20:
725
+ #line 40 "src/scanner.rl"
726
+ {te = p+1;}
727
+ break;
728
+ case 21:
729
+ #line 12 "src/scanner.rl"
730
+ {te = p;p--;{ RET; }}
731
+ break;
732
+ case 22:
733
+ #line 15 "src/scanner.rl"
734
+ {te = p;p--;{ RET; }}
735
+ break;
736
+ case 23:
737
+ #line 16 "src/scanner.rl"
738
+ {te = p;p--;{ trunc = 1; RET; }}
739
+ break;
740
+ case 24:
741
+ #line 23 "src/scanner.rl"
742
+ {te = p;p--;{ RET; }}
743
+ break;
744
+ case 25:
745
+ #line 26 "src/scanner.rl"
746
+ {te = p;p--;{ RET; }}
747
+ break;
748
+ case 26:
749
+ #line 29 "src/scanner.rl"
750
+ {te = p;p--;{ RET; }}
751
+ break;
752
+ case 27:
753
+ #line 32 "src/scanner.rl"
754
+ {te = p;p--;{ STRIP('.'); }}
755
+ break;
756
+ case 28:
757
+ #line 35 "src/scanner.rl"
758
+ {te = p;p--;{ RET; }}
759
+ break;
760
+ case 29:
761
+ #line 36 "src/scanner.rl"
762
+ {te = p;p--;{ RET; }}
763
+ break;
764
+ case 30:
765
+ #line 40 "src/scanner.rl"
766
+ {te = p;p--;}
767
+ break;
768
+ case 31:
769
+ #line 12 "src/scanner.rl"
770
+ {{p = ((te))-1;}{ RET; }}
771
+ break;
772
+ case 32:
773
+ #line 15 "src/scanner.rl"
774
+ {{p = ((te))-1;}{ RET; }}
775
+ break;
776
+ case 33:
777
+ #line 23 "src/scanner.rl"
778
+ {{p = ((te))-1;}{ RET; }}
779
+ break;
780
+ case 34:
781
+ #line 29 "src/scanner.rl"
782
+ {{p = ((te))-1;}{ RET; }}
783
+ break;
784
+ case 35:
785
+ #line 35 "src/scanner.rl"
786
+ {{p = ((te))-1;}{ RET; }}
787
+ break;
788
+ case 36:
789
+ #line 40 "src/scanner.rl"
790
+ {{p = ((te))-1;}}
791
+ break;
792
+ case 37:
793
+ #line 1 "src/scanner.rl"
794
+ { switch( act ) {
795
+ case 2:
796
+ {{p = ((te))-1;} RET; }
797
+ break;
798
+ case 3:
799
+ {{p = ((te))-1;} trunc = 1; RET; }
800
+ break;
801
+ case 4:
802
+ {{p = ((te))-1;} trunc = 2; RET; }
803
+ break;
804
+ case 5:
805
+ {{p = ((te))-1;} RET; }
806
+ break;
807
+ case 6:
808
+ {{p = ((te))-1;} RET; }
809
+ break;
810
+ case 7:
811
+ {{p = ((te))-1;} RET; }
812
+ break;
813
+ case 8:
814
+ {{p = ((te))-1;} RET; }
815
+ break;
816
+ case 9:
817
+ {{p = ((te))-1;} STRIP('.'); }
818
+ break;
819
+ case 10:
820
+ {{p = ((te))-1;} RET; }
821
+ break;
822
+ case 11:
823
+ {{p = ((te))-1;} RET; }
824
+ break;
825
+ case 13:
826
+ {{p = ((te))-1;}}
827
+ break;
828
+ }
829
+ }
830
+ break;
831
+ #line 832 "src/scanner.c"
832
+ }
833
+ }
834
+
835
+ _again:
836
+ _acts = _StdTok_actions + _StdTok_to_state_actions[cs];
837
+ _nacts = (unsigned int) *_acts++;
838
+ while ( _nacts-- > 0 ) {
839
+ switch ( *_acts++ ) {
840
+ case 3:
841
+ #line 1 "src/scanner.rl"
842
+ {ts = 0;}
843
+ break;
844
+ #line 845 "src/scanner.c"
845
+ }
846
+ }
847
+
848
+ if ( cs == 0 )
849
+ goto _out;
850
+ if ( ++p != pe )
851
+ goto _resume;
852
+ _test_eof: {}
853
+ if ( p == eof )
854
+ {
855
+ if ( _StdTok_eof_trans[cs] > 0 ) {
856
+ _trans = _StdTok_eof_trans[cs] - 1;
857
+ goto _eof_trans;
858
+ }
859
+ }
860
+
861
+ _out: {}
862
+ }
863
+ #line 54 "src/scanner.rl"
864
+
865
+ if ( cs == StdTok_error )
866
+ fprintf(stderr, "PARSE ERROR\n" );
867
+ else if ( ts ) fprintf(stderr, "STUFF LEFT: '%s'\n", ts);
868
+ return;
869
+
870
+ ret:
871
+ {
872
+ size_t __len = te - ts - skip - trunc;
873
+ if (__len > out_size)
874
+ __len = out_size;
875
+
876
+ *start = ts;
877
+ *end = te;
878
+
879
+ if (strip_char) {
880
+ char *__p = ts + skip;
881
+ char *__o = out;
882
+ for (; __p < (ts + skip + __len); ++__p) {
883
+ if (*__p != strip_char)
884
+ *__o++ = *__p;
885
+ }
886
+ *token_size = __o - out;
887
+ }
888
+ else {
889
+ memcpy(out, ts + skip, __len);
890
+ *token_size = __len;
891
+ }
892
+
893
+ out[*token_size] = 0;
894
+ }
895
+ }
data/ext/scanner.in ADDED
@@ -0,0 +1,43 @@
1
+ #// scanner.in -*-C-*-
2
+
3
+ %%{
4
+ machine StdTok;
5
+ include URL "url.rl";
6
+ include Email "email.rl";
7
+
8
+ token = frt_alpha frt_alnum*;
9
+
10
+ frt_tokenizer := |*
11
+ #// Email
12
+ email { RET; };
13
+
14
+ #// Token, or token with possessive
15
+ token { RET; };
16
+ token [\'] { trunc = 1; RET; };
17
+ token [\'][sS] { trunc = 2; RET; };
18
+
19
+ #// Contractions (e.g. don't)
20
+ frt_alpha+ [\'] frt_alpha+ { RET; };
21
+
22
+ #// Token with hyphens
23
+ frt_alnum+ ([\-_] frt_alnum+)* { RET; };
24
+
25
+ #// Company name
26
+ token [\&\@] token* { RET; };
27
+
28
+ #// URL
29
+ url { RET; };
30
+
31
+ #// Acronym
32
+ (frt_alpha '.')+ frt_alpha { STRIP('.'); };
33
+
34
+ #// Integers and floats, with an optional leading sign
35
+ [\-\+]?frt_digit+ { RET; };
36
+ [\-\+]?frt_digit+ '.' frt_digit+ { RET; };
37
+
38
+ #// Stop at the NUL terminator; skip whitespace and any other non-alphanumeric character
39
+ 0 { return; };
40
+ (any - frt_alnum) {};
41
+
42
+ *|;
43
+ }%%
data/ext/scanner.rl ADDED
@@ -0,0 +1,85 @@
1
+ /* scanner.rl -*-C-*- */
2
+ #include <ctype.h>
3
+ #include <stdio.h>
4
+ #include <stdlib.h>
5
+ #include <string.h>
6
+ #include <unistd.h>
7
+ /* #include "global.h" */
8
+ /* #include "internal.h" */
9
+ /* RET: finish the current scanner action by jumping to the ret label inside frt_std_scan. The expansion already ends in a semicolon. */
10
+ #define RET goto ret;
11
+ /* STRIP(c): like RET, but first stores c in the local strip_char so the ret block omits that character while copying the token. */
12
+ #define STRIP(c) do { \
13
+ strip_char = c; \
14
+ goto ret; \
15
+ } while(0)
16
+
17
+ %%{
18
+ machine StdTok;
19
+ alphtype unsigned char;
20
+
21
+ frt_alpha = alpha;
22
+ frt_alnum = alnum;
23
+ frt_digit = digit;
24
+
25
+ include StdTok "scanner.in";
26
+
27
+ main := any @{ fhold; fcall frt_tokenizer; };
28
+ }%%
29
+
30
+ %% write data nofinal;
31
+ /* Run the StdTok scanner over in and, when a rule jumps to ret, copy the matched token text into out and report its position through start, end and token_size. */
32
+ void frt_std_scan(const char *in, /* input text; scanning starts at its first byte */
33
+ char *out, size_t out_size, /* destination buffer, and the maximum number of token bytes copied into it */
34
+ const char **start, /* set to ts when a token is returned; left at 0 otherwise */
35
+ const char **end, /* set to te when a token is returned; left at 0 otherwise */
36
+ int *token_size) /* set to the number of bytes stored in out, not counting the terminating 0; left at 0 otherwise */
37
+ {
38
+ int cs, act, top; /* state variables for the Ragel-generated code */
39
+ int stack[32]; /* call stack for the fcall in the main machine */
40
+ char *ts = 0, *te = 0; /* token start and token end pointers of the current match */
41
+
42
+ %% write init;
43
+
44
+ char *p = (char *)in, *pe = 0, *eof = pe; /* pe and eof stay 0, so no buffer end is supplied; NOTE(review): the input presumably must contain a NUL byte so the 0 rule in scanner.in can stop the scan -- confirm with callers */
45
+ int skip = 0; /* leading bytes of the match to leave out (assigned by the url rule after the protocol prefix) */
46
+ int trunc = 0; /* trailing bytes of the match to leave out (assigned by the possessive rules and by the url trailing slash) */
47
+ char strip_char = 0; /* when non-zero, this character is dropped while copying (assigned by STRIP) */
48
+ /* Default results; they are kept whenever the scan ends without reaching ret. */
49
+ *end = 0;
50
+ *start = 0;
51
+ *token_size = 0;
52
+
53
+ %% write exec;
54
+ /* Reached only when the generated code finishes without jumping to ret. */
55
+ if ( cs == StdTok_error )
56
+ fprintf(stderr, "PARSE ERROR\n" );
57
+ else if ( ts ) fprintf(stderr, "STUFF LEFT: '%s'\n", ts);
58
+ return;
59
+ /* A rule matched: copy the token to out with skip, trunc and strip_char applied. */
60
+ ret:
61
+ {
62
+ size_t __len = te - ts - skip - trunc; /* match length without the skipped prefix and the truncated suffix */
63
+ if (__len > out_size) /* never copy more than out_size bytes */
64
+ __len = out_size;
65
+
66
+ *start = ts;
67
+ *end = te;
68
+
69
+ if (strip_char) { /* copy byte by byte, leaving out every strip_char */
70
+ char *__p = ts + skip;
71
+ char *__o = out;
72
+ for (; __p < (ts + skip + __len); ++__p) {
73
+ if (*__p != strip_char)
74
+ *__o++ = *__p;
75
+ }
76
+ *token_size = __o - out; /* can be smaller than __len when characters were dropped */
77
+ }
78
+ else {
79
+ memcpy(out, ts + skip, __len);
80
+ *token_size = __len;
81
+ }
82
+ /* NOTE(review): after clamping, *token_size can equal out_size, so the store below can hit out[out_size]; out then needs room for out_size + 1 bytes -- confirm with callers. */
83
+ out[*token_size] = 0;
84
+ }
85
+ }
data/ext/url.rl ADDED
@@ -0,0 +1,27 @@
1
+ #// url.rl -*-C-*-
2
+
3
+ %%{
4
+ machine URL;
5
+
6
+ uword = [_] | alnum;
7
+ dword = '-' | uword;
8
+ dalnum = '-' | alnum;
9
+ proto = 'http'[s]? | 'ftp' | 'file';
10
+ urlc = alnum | [.,\/_\-\@\:];
11
+
12
+ url =
13
+ (
14
+ proto [:][/]+ %{ skip = p - ts; } dword+ ([.] uword dword*)+ |
15
+ alnum+ [:][/]+ urlc+ |
16
+
17
+ (alnum (dalnum* alnum)? [.])+ #// Subdomains
18
+ ('com' |'edu' | 'biz' | 'gov' |
19
+ 'int' | 'info' | 'mil' | 'net' |
20
+ 'org' | alpha{2})
21
+ )
22
+
23
+ #// Port
24
+ ( [:] digit+ )?
25
+
26
+ ([/]? @{ trunc = 1; });
27
+ }%%
data/test/test.rb ADDED
@@ -0,0 +1,7 @@
1
+ require 'ext/ferret_tokenizer'
2
+
3
+ t = FerretTokenizer.new '@feedbackmine sh*t, I LOVE this!!! so funny:-) :('
4
+ while (tok = t.next)
5
+ p tok
6
+ end
7
+
metadata ADDED
@@ -0,0 +1,75 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sishen-ferret_tokenizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - FeedbackMine
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-07-28 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.8.3
24
+ version:
25
+ description: A string tokenizer based on Ferret::Analysis::StandardAnalyzer.
26
+ email: feedbackmine@feedbackmine.com
27
+ executables: []
28
+
29
+ extensions:
30
+ - ext/extconf.rb
31
+ extra_rdoc_files:
32
+ - Manifest.txt
33
+ files:
34
+ - ext/ferret_tokenizer.c
35
+ - ext/scanner.rl
36
+ - ext/url.rl
37
+ - ext/extconf.rb
38
+ - ext/scanner.c
39
+ - ext/email.rl
40
+ - ext/scanner.in
41
+ - Rakefile
42
+ - Manifest.txt
43
+ - README.txt
44
+ - test
45
+ - test/test.rb
46
+ has_rdoc: true
47
+ homepage: http://www.tweetjobsearch.com
48
+ licenses:
49
+ post_install_message:
50
+ rdoc_options:
51
+ - --main
52
+ - README.txt
53
+ require_paths:
54
+ - ext
55
+ required_ruby_version: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: "0"
60
+ version:
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: "0"
66
+ version:
67
+ requirements: []
68
+
69
+ rubyforge_project: tokenizer
70
+ rubygems_version: 1.3.5
71
+ signing_key:
72
+ specification_version: 2
73
+ summary: tokenizer
74
+ test_files: []
75
+