github-linguist 5.3.1 → 5.3.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,353 @@
1
+ #ifndef linguist_yyHEADER_H
2
+ #define linguist_yyHEADER_H 1
3
+ #define linguist_yyIN_HEADER 1
4
+
5
+ #line 6 "lex.linguist_yy.h"
6
+
7
+ #define YY_INT_ALIGNED short int
8
+
9
+ /* A lexical scanner generated by flex */
10
+
11
+ #define FLEX_SCANNER
12
+ #define YY_FLEX_MAJOR_VERSION 2
13
+ #define YY_FLEX_MINOR_VERSION 5
14
+ #define YY_FLEX_SUBMINOR_VERSION 39
15
+ #if YY_FLEX_SUBMINOR_VERSION > 0
16
+ #define FLEX_BETA
17
+ #endif
18
+
19
+ /* First, we deal with platform-specific or compiler-specific issues. */
20
+
21
+ /* begin standard C headers. */
22
+ #include <stdio.h>
23
+ #include <string.h>
24
+ #include <errno.h>
25
+ #include <stdlib.h>
26
+
27
+ /* end standard C headers. */
28
+
29
+ /* flex integer type definitions */
30
+
31
+ #ifndef FLEXINT_H
32
+ #define FLEXINT_H
33
+
34
+ /* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
35
+
36
+ #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
37
+
38
+ /* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
39
+ * if you want the limit (max/min) macros for int types.
40
+ */
41
+ #ifndef __STDC_LIMIT_MACROS
42
+ #define __STDC_LIMIT_MACROS 1
43
+ #endif
44
+
45
+ #include <inttypes.h>
46
+ typedef int8_t flex_int8_t;
47
+ typedef uint8_t flex_uint8_t;
48
+ typedef int16_t flex_int16_t;
49
+ typedef uint16_t flex_uint16_t;
50
+ typedef int32_t flex_int32_t;
51
+ typedef uint32_t flex_uint32_t;
52
+ #else
53
+ typedef signed char flex_int8_t;
54
+ typedef short int flex_int16_t;
55
+ typedef int flex_int32_t;
56
+ typedef unsigned char flex_uint8_t;
57
+ typedef unsigned short int flex_uint16_t;
58
+ typedef unsigned int flex_uint32_t;
59
+
60
+ /* Limits of integral types. */
61
+ #ifndef INT8_MIN
62
+ #define INT8_MIN (-128)
63
+ #endif
64
+ #ifndef INT16_MIN
65
+ #define INT16_MIN (-32767-1)
66
+ #endif
67
+ #ifndef INT32_MIN
68
+ #define INT32_MIN (-2147483647-1)
69
+ #endif
70
+ #ifndef INT8_MAX
71
+ #define INT8_MAX (127)
72
+ #endif
73
+ #ifndef INT16_MAX
74
+ #define INT16_MAX (32767)
75
+ #endif
76
+ #ifndef INT32_MAX
77
+ #define INT32_MAX (2147483647)
78
+ #endif
79
+ #ifndef UINT8_MAX
80
+ #define UINT8_MAX (255U)
81
+ #endif
82
+ #ifndef UINT16_MAX
83
+ #define UINT16_MAX (65535U)
84
+ #endif
85
+ #ifndef UINT32_MAX
86
+ #define UINT32_MAX (4294967295U)
87
+ #endif
88
+
89
+ #endif /* ! C99 */
90
+
91
+ #endif /* ! FLEXINT_H */
92
+
93
+ #ifdef __cplusplus
94
+
95
+ /* The "const" storage-class-modifier is valid. */
96
+ #define YY_USE_CONST
97
+
98
+ #else /* ! __cplusplus */
99
+
100
+ /* C99 requires __STDC__ to be defined as 1. */
101
+ #if defined (__STDC__)
102
+
103
+ #define YY_USE_CONST
104
+
105
+ #endif /* defined (__STDC__) */
106
+ #endif /* ! __cplusplus */
107
+
108
+ #ifdef YY_USE_CONST
109
+ #define yyconst const
110
+ #else
111
+ #define yyconst
112
+ #endif
113
+
114
+ /* An opaque pointer. */
115
+ #ifndef YY_TYPEDEF_YY_SCANNER_T
116
+ #define YY_TYPEDEF_YY_SCANNER_T
117
+ typedef void* yyscan_t;
118
+ #endif
119
+
120
+ /* For convenience, these vars (plus the bison vars far below)
121
+ are macros in the reentrant scanner. */
122
+ #define yyin yyg->yyin_r
123
+ #define yyout yyg->yyout_r
124
+ #define yyextra yyg->yyextra_r
125
+ #define yyleng yyg->yyleng_r
126
+ #define yytext yyg->yytext_r
127
+ #define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
128
+ #define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
129
+ #define yy_flex_debug yyg->yy_flex_debug_r
130
+
131
+ /* Size of default input buffer. */
132
+ #ifndef YY_BUF_SIZE
133
+ #ifdef __ia64__
134
+ /* On IA-64, the buffer size is 16k, not 8k.
135
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
136
+ * Ditto for the __ia64__ case accordingly.
137
+ */
138
+ #define YY_BUF_SIZE 32768
139
+ #else
140
+ #define YY_BUF_SIZE 16384
141
+ #endif /* __ia64__ */
142
+ #endif
143
+
144
+ #ifndef YY_TYPEDEF_YY_BUFFER_STATE
145
+ #define YY_TYPEDEF_YY_BUFFER_STATE
146
+ typedef struct yy_buffer_state *YY_BUFFER_STATE;
147
+ #endif
148
+
149
#ifndef YY_TYPEDEF_YY_SIZE_T
#define YY_TYPEDEF_YY_SIZE_T
/* Size type used for the buffer size and character count fields below. */
typedef size_t yy_size_t;
#endif

#ifndef YY_STRUCT_YY_BUFFER_STATE
#define YY_STRUCT_YY_BUFFER_STATE
/* Bookkeeping for a single scanner input buffer. */
struct yy_buffer_state {
    /* stdio stream belonging to this buffer.
     * NOTE(review): presumably the stream the buffer is refilled from;
     * confirm against the generated scanner.
     */
    FILE *yy_input_file;

    /* The input buffer itself. */
    char *yy_ch_buf;

    /* Current position inside the input buffer. */
    char *yy_buf_pos;

    /* Capacity of the input buffer in bytes; the room reserved for the
     * EOB characters is not counted.
     */
    yy_size_t yy_buf_size;

    /* How many characters have been read into yy_ch_buf; the EOB
     * characters are not counted.
     */
    yy_size_t yy_n_chars;

    /* Non-zero when the scanner created the buffer itself, meaning it
     * may realloc() it to grow it and must free() it to delete it.
     */
    int yy_is_our_buffer;

    /* Non-zero for an "interactive" input source.  With stdio input this
     * selects getc() over fread() so that reading stops after each
     * newline.
     */
    int yy_is_interactive;

    /* Non-zero when the scanner is considered to be at the beginning of
     * a line; '^' rules are active on the next match only in that case.
     */
    int yy_at_bol;

    /* Line count. */
    int yy_bs_lineno;

    /* Column count. */
    int yy_bs_column;

    /* Non-zero when the scanner should try to refill the input buffer
     * upon reaching its end.
     */
    int yy_fill_buffer;

    /* Buffer status code.  The meaning of its values is defined outside
     * this header.
     */
    int yy_buffer_status;
};
#endif /* !YY_STRUCT_YY_BUFFER_STATE */
204
+
205
+ void linguist_yyrestart (FILE *input_file ,yyscan_t yyscanner );
206
+ void linguist_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
207
+ YY_BUFFER_STATE linguist_yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
208
+ void linguist_yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
209
+ void linguist_yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
210
+ void linguist_yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
211
+ void linguist_yypop_buffer_state (yyscan_t yyscanner );
212
+
213
+ YY_BUFFER_STATE linguist_yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
214
+ YY_BUFFER_STATE linguist_yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
215
+ YY_BUFFER_STATE linguist_yy_scan_bytes (yyconst char *bytes,yy_size_t len ,yyscan_t yyscanner );
216
+
217
+ void *linguist_yyalloc (yy_size_t ,yyscan_t yyscanner );
218
+ void *linguist_yyrealloc (void *,yy_size_t ,yyscan_t yyscanner );
219
+ void linguist_yyfree (void * ,yyscan_t yyscanner );
220
+
221
+ /* Begin user sect3 */
222
+
223
+ #define yytext_ptr yytext_r
224
+
225
+ #ifdef YY_HEADER_EXPORT_START_CONDITIONS
226
+ #define INITIAL 0
227
+ #define sgml 1
228
+ #define c_comment 2
229
+ #define xml_comment 3
230
+ #define haskell_comment 4
231
+ #define ocaml_comment 5
232
+ #define python_dcomment 6
233
+ #define python_scomment 7
234
+
235
+ #endif
236
+
237
+ #ifndef YY_NO_UNISTD_H
238
+ /* Special case for "unistd.h", since it is non-ANSI. We include it way
239
+ * down here because we want the user's section 1 to have been scanned first.
240
+ * The user has a chance to override it with an option.
241
+ */
242
+ #include <unistd.h>
243
+ #endif
244
+
245
+ #define YY_EXTRA_TYPE struct tokenizer_extra *
246
+
247
+ int linguist_yylex_init (yyscan_t* scanner);
248
+
249
+ int linguist_yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
250
+
251
+ /* Accessor methods to globals.
252
+ These are made visible to non-reentrant scanners for convenience. */
253
+
254
+ int linguist_yylex_destroy (yyscan_t yyscanner );
255
+
256
+ int linguist_yyget_debug (yyscan_t yyscanner );
257
+
258
+ void linguist_yyset_debug (int debug_flag ,yyscan_t yyscanner );
259
+
260
+ YY_EXTRA_TYPE linguist_yyget_extra (yyscan_t yyscanner );
261
+
262
+ void linguist_yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
263
+
264
+ FILE *linguist_yyget_in (yyscan_t yyscanner );
265
+
266
+ void linguist_yyset_in (FILE * in_str ,yyscan_t yyscanner );
267
+
268
+ FILE *linguist_yyget_out (yyscan_t yyscanner );
269
+
270
+ void linguist_yyset_out (FILE * out_str ,yyscan_t yyscanner );
271
+
272
+ yy_size_t linguist_yyget_leng (yyscan_t yyscanner );
273
+
274
+ char *linguist_yyget_text (yyscan_t yyscanner );
275
+
276
+ int linguist_yyget_lineno (yyscan_t yyscanner );
277
+
278
+ void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner );
279
+
280
+ int linguist_yyget_column (yyscan_t yyscanner );
281
+
282
+ void linguist_yyset_column (int column_no ,yyscan_t yyscanner );
283
+
284
+ /* Macros after this point can all be overridden by user definitions in
285
+ * section 1.
286
+ */
287
+
288
+ #ifndef YY_SKIP_YYWRAP
289
+ #ifdef __cplusplus
290
+ extern "C" int linguist_yywrap (yyscan_t yyscanner );
291
+ #else
292
+ extern int linguist_yywrap (yyscan_t yyscanner );
293
+ #endif
294
+ #endif
295
+
296
+ #ifndef yytext_ptr
297
+ static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
298
+ #endif
299
+
300
+ #ifdef YY_NEED_STRLEN
301
+ static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
302
+ #endif
303
+
304
+ #ifndef YY_NO_INPUT
305
+
306
+ #endif
307
+
308
+ /* Amount of stuff to slurp up with each read. */
309
+ #ifndef YY_READ_BUF_SIZE
310
+ #ifdef __ia64__
311
+ /* On IA-64, the buffer size is 16k, not 8k */
312
+ #define YY_READ_BUF_SIZE 16384
313
+ #else
314
+ #define YY_READ_BUF_SIZE 8192
315
+ #endif /* __ia64__ */
316
+ #endif
317
+
318
+ /* Number of entries by which start-condition stack grows. */
319
+ #ifndef YY_START_STACK_INCR
320
+ #define YY_START_STACK_INCR 25
321
+ #endif
322
+
323
+ /* Default declaration of generated scanner - a define so the user can
324
+ * easily add parameters.
325
+ */
326
+ #ifndef YY_DECL
327
+ #define YY_DECL_IS_OURS 1
328
+
329
+ extern int linguist_yylex (yyscan_t yyscanner);
330
+
331
+ #define YY_DECL int linguist_yylex (yyscan_t yyscanner)
332
+ #endif /* !YY_DECL */
333
+
334
+ /* yy_get_previous_state - get the state just before the EOB char was reached */
335
+
336
+ #undef YY_NEW_FILE
337
+ #undef YY_FLUSH_BUFFER
338
+ #undef yy_set_bol
339
+ #undef yy_new_buffer
340
+ #undef yy_set_interactive
341
+ #undef YY_DO_BEFORE_ACTION
342
+
343
+ #ifdef YY_DECL_IS_OURS
344
+ #undef YY_DECL_IS_OURS
345
+ #undef YY_DECL
346
+ #endif
347
+
348
+ #line 117 "tokenizer.l"
349
+
350
+
351
+ #line 352 "lex.linguist_yy.h"
352
+ #undef linguist_yyIN_HEADER
353
+ #endif /* linguist_yyHEADER_H */
@@ -0,0 +1,64 @@
1
+ #include "ruby.h"
2
+ #include "linguist.h"
3
+ #include "lex.linguist_yy.h"
4
+
5
+ int linguist_yywrap(yyscan_t yyscanner) {
6
+ return 1;
7
+ }
8
+
9
+ static VALUE rb_tokenizer_extract_tokens(VALUE self, VALUE rb_data) {
10
+ YY_BUFFER_STATE buf;
11
+ yyscan_t scanner;
12
+ struct tokenizer_extra extra;
13
+ VALUE ary, s;
14
+ long len;
15
+ int r;
16
+
17
+ Check_Type(rb_data, T_STRING);
18
+
19
+ len = RSTRING_LEN(rb_data);
20
+ if (len > 100000)
21
+ len = 100000;
22
+
23
+ linguist_yylex_init_extra(&extra, &scanner);
24
+ buf = linguist_yy_scan_bytes(RSTRING_PTR(rb_data), (int) len, scanner);
25
+
26
+ ary = rb_ary_new();
27
+ do {
28
+ extra.type = NO_ACTION;
29
+ extra.token = NULL;
30
+ r = linguist_yylex(scanner);
31
+ switch (extra.type) {
32
+ case NO_ACTION:
33
+ break;
34
+ case REGULAR_TOKEN:
35
+ rb_ary_push(ary, rb_str_new2(extra.token));
36
+ free(extra.token);
37
+ break;
38
+ case SHEBANG_TOKEN:
39
+ s = rb_str_new2("SHEBANG#!");
40
+ rb_str_cat2(s, extra.token);
41
+ rb_ary_push(ary, s);
42
+ free(extra.token);
43
+ break;
44
+ case SGML_TOKEN:
45
+ s = rb_str_new2(extra.token);
46
+ rb_str_cat2(s, ">");
47
+ rb_ary_push(ary, s);
48
+ free(extra.token);
49
+ break;
50
+ }
51
+ } while (r);
52
+
53
+ linguist_yy_delete_buffer(buf, scanner);
54
+ linguist_yylex_destroy(scanner);
55
+
56
+ return ary;
57
+ }
58
+
59
+ __attribute__((visibility("default"))) void Init_linguist() {
60
+ VALUE rb_mLinguist = rb_define_module("Linguist");
61
+ VALUE rb_cTokenizer = rb_define_class_under(rb_mLinguist, "Tokenizer", rb_cObject);
62
+
63
+ rb_define_method(rb_cTokenizer, "extract_tokens", rb_tokenizer_extract_tokens, 1);
64
+ }
@@ -0,0 +1,11 @@
1
#ifndef LINGUIST_H
#define LINGUIST_H

/*
 * What the scanner reported for one call to the lexer.  The consumer in
 * linguist.c turns each kind into a Ruby string as described below.
 */
enum tokenizer_type {
  NO_ACTION,      /* nothing to emit for this call */
  REGULAR_TOKEN,  /* emitted unchanged */
  SHEBANG_TOKEN,  /* emitted with a "SHEBANG#!" prefix */
  SGML_TOKEN,     /* emitted with ">" appended */
};

/*
 * Per-scanner state shared between the lexer actions and their caller
 * (the scanner's "extra" data).
 */
struct tokenizer_extra {
  /* Token text allocated by the lexer with strdup(); the caller frees it.
   * May be NULL when no token was produced or the allocation failed.
   */
  char *token;
  /* Kind of token stored in `token`. */
  enum tokenizer_type type;
};

#endif /* LINGUIST_H */
@@ -0,0 +1,119 @@
1
%{

#include "linguist.h"

/*
 * Publish one token to the caller through the scanner's extra data.
 * `tok` is the token text (the actions pass a strdup()'d string) and
 * `typ` is its enum tokenizer_type.
 */
#define feed_token(tok, typ) do { \
    yyextra->token = (tok); \
    yyextra->type = (typ); \
  } while (0)

/*
 * End-of-input handling for the two macros below: depending on the flex
 * release, input() reports end of input either as EOF or as 0 (flex 2.6.x),
 * so both values are treated as "no more input".  Checking only for EOF
 * makes these loops spin forever on scanners that return 0.  As a
 * consequence a literal NUL byte in the data also ends the scan.
 */

/*
 * Discard input up to and including the next newline.  Ends the scan
 * (yyterminate) if the input runs out first.
 */
#define eat_until_eol() do { \
    int c; \
    while ((c = input(yyscanner)) != '\n' && c != EOF && c != 0); \
    if (c == EOF || c == 0) \
      yyterminate(); \
  } while (0)

/*
 * Discard input up to and including the next occurrence of character `q`
 * that is not preceded by a backslash, stopping early at a newline.  Ends
 * the scan (yyterminate) if the input runs out first.
 */
#define eat_until_unescaped(q) do { \
    int c; \
    while ((c = input(yyscanner)) != EOF && c != 0) { \
      if (c == '\n') \
        break; \
      if (c == '\\') { \
        c = input(yyscanner); \
        if (c == EOF || c == 0) \
          yyterminate(); \
      } else if (c == (q)) \
        break; \
    } \
    if (c == EOF || c == 0) \
      yyterminate(); \
  } while (0)

%}
34
+
35
+ %option never-interactive yywrap reentrant nounput warn nodefault header-file="lex.linguist_yy.h" extra-type="struct tokenizer_extra *" prefix="linguist_yy"
36
+ %x sgml c_comment xml_comment haskell_comment ocaml_comment python_dcomment python_scomment
37
+
38
+ %%
39
+
40
^#![ \t]*([[:alnum:]_\/]*\/)?env([ \t]+([^ \t=]*=[^ \t]*))*[ \t]+[[:alpha:]_]+ {
    /*
     * Shebang that goes through env, optionally with VAR=value
     * assignments.  The interpreter name is the trailing run of
     * non-blank characters.  The pattern allows either a space or a tab
     * in front of it, so walk back to the last blank of either kind
     * instead of looking for a space only.
     */
    const char *off = yytext + yyleng;
    while (off > yytext && off[-1] != ' ' && off[-1] != '\t')
      --off;
    feed_token(strdup(off), SHEBANG_TOKEN);
    /* The rest of the shebang line carries no further tokens. */
    eat_until_eol();
    return 1;
  }
50
+
51
^#![ \t]*[[:alpha:]_\/]+ {
    /*
     * Plain shebang.  The token is whatever follows the last '/' of the
     * match, or the whole match when there is no '/'.  A bare "env" is
     * not reported; in both cases the rest of the line is skipped.
     */
    const char *last_slash = strrchr(yytext, '/');
    const char *name = last_slash ? last_slash + 1 : yytext;
    const int is_env = (strcmp(name, "env") == 0);

    if (!is_env)
      feed_token(strdup(name), SHEBANG_TOKEN);
    eat_until_eol();
    if (!is_env)
      return 1;
  }
65
+
66
^[ \t]*(\/\/|--|\#|%|\")" ".* {
    /* Line comment: optional blanks, a comment leader, one space and
     * the rest of the line.  Nothing is emitted. */
  }

"/*" {
    /* Opening of a C-style block comment.  xml_comment has no start
     * rule in this group; the SGML tag rule enters it. */
    BEGIN(c_comment);
  }
"{-" {
    BEGIN(haskell_comment);
  }
"(*" {
    BEGIN(ocaml_comment);
  }
"\"\"\"" {
    BEGIN(python_dcomment);
  }
"'''" {
    BEGIN(python_scomment);
  }

<c_comment,xml_comment,haskell_comment,ocaml_comment,python_dcomment,python_scomment>.|\n {
    /* Inside any multi-line comment every character is dropped. */
  }
<c_comment>"*/" {
    BEGIN(INITIAL);
  }
<xml_comment>"-->" {
    BEGIN(INITIAL);
  }
<haskell_comment>"-}" {
    BEGIN(INITIAL);
  }
<ocaml_comment>"*)" {
    BEGIN(INITIAL);
  }
<python_dcomment>"\"\"\"" {
    BEGIN(INITIAL);
  }
<python_scomment>"'''" {
    BEGIN(INITIAL);
  }

\"\"|'' {
    /* Empty string literal: nothing to emit. */
  }
\" {
    /* Double-quoted string: skip to the closing unescaped quote. */
    eat_until_unescaped('"');
  }
' {
    /* Single-quoted string: skip to the closing unescaped quote. */
    eat_until_unescaped('\'');
  }
(0x[0-9a-fA-F]([0-9a-fA-F]|\.)*|[0-9]([0-9]|\.)*)([uU][lL]{0,2}|([eE][-+][0-9]*)?[fFlL]*) {
    /* Numeric literal, hexadecimal or decimal with an optional suffix
     * or exponent: nothing to emit. */
  }
87
\<[^ \t\n\r<>]+/>|" "[^<>\n]{0,2048}> {
    /*
     * Start of an SGML-style tag.  The literal "<!--" opens an XML
     * comment; anything else is reported as an SGML token and the
     * attributes are then scanned in the sgml state.
     */
    if (strcmp(yytext, "<!--") == 0) {
      BEGIN(xml_comment);
    } else {
      feed_token(strdup(yytext), SGML_TOKEN);
      BEGIN(sgml);
      return 1;
    }
  }
<sgml>[[:alnum:]_]+=/\" {
    /* name="..." : report "name=", then consume the opening quote and
     * skip the quoted value. */
    feed_token(strdup(yytext), REGULAR_TOKEN);
    input(yyscanner);
    eat_until_unescaped('"');
    return 1;
  }
<sgml>[[:alnum:]_]+=/' {
    /* name='...' : same as above for single quotes. */
    feed_token(strdup(yytext), REGULAR_TOKEN);
    input(yyscanner);
    eat_until_unescaped('\'');
    return 1;
  }
<sgml>[[:alnum:]_]+=[[:alnum:]_]* {
    /*
     * name=value without quotes: report only "name=".  The copy is
     * checked before it is truncated so that a failed strdup() is passed
     * on as a NULL token instead of being dereferenced here.
     */
    char *attr = strdup(yytext);
    if (attr != NULL) {
      char *eq = strchr(attr, '=');
      if (eq != NULL)
        eq[1] = '\0';
    }
    feed_token(attr, REGULAR_TOKEN);
    return 1;
  }
<sgml>[[:alnum:]_]+ {
    /* Bare attribute name. */
    feed_token(strdup(yytext), REGULAR_TOKEN);
    return 1;
  }
<sgml>\> {
    /* End of the tag. */
    BEGIN(INITIAL);
  }
<sgml>.|\n {
    /* Anything else inside a tag is dropped. */
  }
102
;|\{|\}|\(|\)|\[|\] {
    /* Punctuation is reported one character at a time. */
    feed_token(strdup(yytext), REGULAR_TOKEN);
    return 1;
  }
[[:alnum:]_.@#/*]+ {
    /*
     * Identifier-like run.  Because the character class contains '/' and
     * '*', a match can begin with a block comment opener; such a match is
     * never reported as a token.
     */
    const size_t n = strlen(yytext);
    const int opens_comment = (strncmp(yytext, "/*", 2) == 0);

    if (!opens_comment) {
      feed_token(strdup(yytext), REGULAR_TOKEN);
      return 1;
    }
    /* A match that also ends with the closer and is at least four
     * characters long is a complete comment; otherwise the comment
     * continues past this match. */
    if (n < 4 || strcmp(yytext + n - 2, "*/") != 0) {
      BEGIN(c_comment);
    }
  }
\<\<?|\+|\-|\*|\/|%|&&?|\|\|? {
    /* Operators. */
    feed_token(strdup(yytext), REGULAR_TOKEN);
    return 1;
  }
.|\n {
    /* Everything else is dropped. */
  }
117
+
118
+ %%
119
+