github-linguist 5.3.1 → 5.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,353 @@
1
+ #ifndef linguist_yyHEADER_H
2
+ #define linguist_yyHEADER_H 1
3
+ #define linguist_yyIN_HEADER 1
4
+
5
+ #line 6 "lex.linguist_yy.h"
6
+
7
+ #define YY_INT_ALIGNED short int
8
+
9
+ /* A lexical scanner generated by flex */
10
+
11
+ #define FLEX_SCANNER
12
+ #define YY_FLEX_MAJOR_VERSION 2
13
+ #define YY_FLEX_MINOR_VERSION 5
14
+ #define YY_FLEX_SUBMINOR_VERSION 39
15
+ #if YY_FLEX_SUBMINOR_VERSION > 0
16
+ #define FLEX_BETA
17
+ #endif
18
+
19
+ /* First, we deal with platform-specific or compiler-specific issues. */
20
+
21
+ /* begin standard C headers. */
22
+ #include <stdio.h>
23
+ #include <string.h>
24
+ #include <errno.h>
25
+ #include <stdlib.h>
26
+
27
+ /* end standard C headers. */
28
+
29
+ /* flex integer type definitions */
30
+
31
+ #ifndef FLEXINT_H
32
+ #define FLEXINT_H
33
+
34
+ /* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
35
+
36
+ #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
37
+
38
+ /* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
39
+ * if you want the limit (max/min) macros for int types.
40
+ */
41
+ #ifndef __STDC_LIMIT_MACROS
42
+ #define __STDC_LIMIT_MACROS 1
43
+ #endif
44
+
45
+ #include <inttypes.h>
46
+ typedef int8_t flex_int8_t;
47
+ typedef uint8_t flex_uint8_t;
48
+ typedef int16_t flex_int16_t;
49
+ typedef uint16_t flex_uint16_t;
50
+ typedef int32_t flex_int32_t;
51
+ typedef uint32_t flex_uint32_t;
52
+ #else
53
+ typedef signed char flex_int8_t;
54
+ typedef short int flex_int16_t;
55
+ typedef int flex_int32_t;
56
+ typedef unsigned char flex_uint8_t;
57
+ typedef unsigned short int flex_uint16_t;
58
+ typedef unsigned int flex_uint32_t;
59
+
60
+ /* Limits of integral types. */
61
+ #ifndef INT8_MIN
62
+ #define INT8_MIN (-128)
63
+ #endif
64
+ #ifndef INT16_MIN
65
+ #define INT16_MIN (-32767-1)
66
+ #endif
67
+ #ifndef INT32_MIN
68
+ #define INT32_MIN (-2147483647-1)
69
+ #endif
70
+ #ifndef INT8_MAX
71
+ #define INT8_MAX (127)
72
+ #endif
73
+ #ifndef INT16_MAX
74
+ #define INT16_MAX (32767)
75
+ #endif
76
+ #ifndef INT32_MAX
77
+ #define INT32_MAX (2147483647)
78
+ #endif
79
+ #ifndef UINT8_MAX
80
+ #define UINT8_MAX (255U)
81
+ #endif
82
+ #ifndef UINT16_MAX
83
+ #define UINT16_MAX (65535U)
84
+ #endif
85
+ #ifndef UINT32_MAX
86
+ #define UINT32_MAX (4294967295U)
87
+ #endif
88
+
89
+ #endif /* ! C99 */
90
+
91
+ #endif /* ! FLEXINT_H */
92
+
93
+ #ifdef __cplusplus
94
+
95
+ /* The "const" storage-class-modifier is valid. */
96
+ #define YY_USE_CONST
97
+
98
+ #else /* ! __cplusplus */
99
+
100
+ /* C99 requires __STDC__ to be defined as 1. */
101
+ #if defined (__STDC__)
102
+
103
+ #define YY_USE_CONST
104
+
105
+ #endif /* defined (__STDC__) */
106
+ #endif /* ! __cplusplus */
107
+
108
+ #ifdef YY_USE_CONST
109
+ #define yyconst const
110
+ #else
111
+ #define yyconst
112
+ #endif
113
+
114
+ /* An opaque pointer. */
115
+ #ifndef YY_TYPEDEF_YY_SCANNER_T
116
+ #define YY_TYPEDEF_YY_SCANNER_T
117
+ typedef void* yyscan_t;
118
+ #endif
119
+
120
+ /* For convenience, these vars (plus the bison vars far below)
121
+ are macros in the reentrant scanner. */
122
+ #define yyin yyg->yyin_r
123
+ #define yyout yyg->yyout_r
124
+ #define yyextra yyg->yyextra_r
125
+ #define yyleng yyg->yyleng_r
126
+ #define yytext yyg->yytext_r
127
+ #define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
128
+ #define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
129
+ #define yy_flex_debug yyg->yy_flex_debug_r
130
+
131
+ /* Size of default input buffer. */
132
+ #ifndef YY_BUF_SIZE
133
+ #ifdef __ia64__
134
+ /* On IA-64, the buffer size is 16k, not 8k.
135
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
136
+ * Ditto for the __ia64__ case accordingly.
137
+ */
138
+ #define YY_BUF_SIZE 32768
139
+ #else
140
+ #define YY_BUF_SIZE 16384
141
+ #endif /* __ia64__ */
142
+ #endif
143
+
144
+ #ifndef YY_TYPEDEF_YY_BUFFER_STATE
145
+ #define YY_TYPEDEF_YY_BUFFER_STATE
146
+ typedef struct yy_buffer_state *YY_BUFFER_STATE;
147
+ #endif
148
+
149
+ #ifndef YY_TYPEDEF_YY_SIZE_T
150
+ #define YY_TYPEDEF_YY_SIZE_T
151
+ typedef size_t yy_size_t;
152
+ #endif
153
+
154
+ #ifndef YY_STRUCT_YY_BUFFER_STATE
155
+ #define YY_STRUCT_YY_BUFFER_STATE
156
+ struct yy_buffer_state
157
+ {
158
+ FILE *yy_input_file;
159
+
160
+ char *yy_ch_buf; /* input buffer */
161
+ char *yy_buf_pos; /* current position in input buffer */
162
+
163
+ /* Size of input buffer in bytes, not including room for EOB
164
+ * characters.
165
+ */
166
+ yy_size_t yy_buf_size;
167
+
168
+ /* Number of characters read into yy_ch_buf, not including EOB
169
+ * characters.
170
+ */
171
+ yy_size_t yy_n_chars;
172
+
173
+ /* Whether we "own" the buffer - i.e., we know we created it,
174
+ * and can realloc() it to grow it, and should free() it to
175
+ * delete it.
176
+ */
177
+ int yy_is_our_buffer;
178
+
179
+ /* Whether this is an "interactive" input source; if so, and
180
+ * if we're using stdio for input, then we want to use getc()
181
+ * instead of fread(), to make sure we stop fetching input after
182
+ * each newline.
183
+ */
184
+ int yy_is_interactive;
185
+
186
+ /* Whether we're considered to be at the beginning of a line.
187
+ * If so, '^' rules will be active on the next match, otherwise
188
+ * not.
189
+ */
190
+ int yy_at_bol;
191
+
192
+ int yy_bs_lineno; /**< The line count. */
193
+ int yy_bs_column; /**< The column count. */
194
+
195
+ /* Whether to try to fill the input buffer when we reach the
196
+ * end of it.
197
+ */
198
+ int yy_fill_buffer;
199
+
200
+ int yy_buffer_status;
201
+
202
+ };
203
+ #endif /* !YY_STRUCT_YY_BUFFER_STATE */
204
+
205
+ void linguist_yyrestart (FILE *input_file ,yyscan_t yyscanner );
206
+ void linguist_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
207
+ YY_BUFFER_STATE linguist_yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
208
+ void linguist_yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
209
+ void linguist_yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
210
+ void linguist_yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
211
+ void linguist_yypop_buffer_state (yyscan_t yyscanner );
212
+
213
+ YY_BUFFER_STATE linguist_yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
214
+ YY_BUFFER_STATE linguist_yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
215
+ YY_BUFFER_STATE linguist_yy_scan_bytes (yyconst char *bytes,yy_size_t len ,yyscan_t yyscanner );
216
+
217
+ void *linguist_yyalloc (yy_size_t ,yyscan_t yyscanner );
218
+ void *linguist_yyrealloc (void *,yy_size_t ,yyscan_t yyscanner );
219
+ void linguist_yyfree (void * ,yyscan_t yyscanner );
220
+
221
+ /* Begin user sect3 */
222
+
223
+ #define yytext_ptr yytext_r
224
+
225
+ #ifdef YY_HEADER_EXPORT_START_CONDITIONS
226
+ #define INITIAL 0
227
+ #define sgml 1
228
+ #define c_comment 2
229
+ #define xml_comment 3
230
+ #define haskell_comment 4
231
+ #define ocaml_comment 5
232
+ #define python_dcomment 6
233
+ #define python_scomment 7
234
+
235
+ #endif
236
+
237
+ #ifndef YY_NO_UNISTD_H
238
+ /* Special case for "unistd.h", since it is non-ANSI. We include it way
239
+ * down here because we want the user's section 1 to have been scanned first.
240
+ * The user has a chance to override it with an option.
241
+ */
242
+ #include <unistd.h>
243
+ #endif
244
+
245
+ #define YY_EXTRA_TYPE struct tokenizer_extra *
246
+
247
+ int linguist_yylex_init (yyscan_t* scanner);
248
+
249
+ int linguist_yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
250
+
251
+ /* Accessor methods to globals.
252
+ These are made visible to non-reentrant scanners for convenience. */
253
+
254
+ int linguist_yylex_destroy (yyscan_t yyscanner );
255
+
256
+ int linguist_yyget_debug (yyscan_t yyscanner );
257
+
258
+ void linguist_yyset_debug (int debug_flag ,yyscan_t yyscanner );
259
+
260
+ YY_EXTRA_TYPE linguist_yyget_extra (yyscan_t yyscanner );
261
+
262
+ void linguist_yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
263
+
264
+ FILE *linguist_yyget_in (yyscan_t yyscanner );
265
+
266
+ void linguist_yyset_in (FILE * in_str ,yyscan_t yyscanner );
267
+
268
+ FILE *linguist_yyget_out (yyscan_t yyscanner );
269
+
270
+ void linguist_yyset_out (FILE * out_str ,yyscan_t yyscanner );
271
+
272
+ yy_size_t linguist_yyget_leng (yyscan_t yyscanner );
273
+
274
+ char *linguist_yyget_text (yyscan_t yyscanner );
275
+
276
+ int linguist_yyget_lineno (yyscan_t yyscanner );
277
+
278
+ void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner );
279
+
280
+ int linguist_yyget_column (yyscan_t yyscanner );
281
+
282
+ void linguist_yyset_column (int column_no ,yyscan_t yyscanner );
283
+
284
+ /* Macros after this point can all be overridden by user definitions in
285
+ * section 1.
286
+ */
287
+
288
+ #ifndef YY_SKIP_YYWRAP
289
+ #ifdef __cplusplus
290
+ extern "C" int linguist_yywrap (yyscan_t yyscanner );
291
+ #else
292
+ extern int linguist_yywrap (yyscan_t yyscanner );
293
+ #endif
294
+ #endif
295
+
296
+ #ifndef yytext_ptr
297
+ static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
298
+ #endif
299
+
300
+ #ifdef YY_NEED_STRLEN
301
+ static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
302
+ #endif
303
+
304
+ #ifndef YY_NO_INPUT
305
+
306
+ #endif
307
+
308
+ /* Amount of stuff to slurp up with each read. */
309
+ #ifndef YY_READ_BUF_SIZE
310
+ #ifdef __ia64__
311
+ /* On IA-64, the buffer size is 16k, not 8k */
312
+ #define YY_READ_BUF_SIZE 16384
313
+ #else
314
+ #define YY_READ_BUF_SIZE 8192
315
+ #endif /* __ia64__ */
316
+ #endif
317
+
318
+ /* Number of entries by which start-condition stack grows. */
319
+ #ifndef YY_START_STACK_INCR
320
+ #define YY_START_STACK_INCR 25
321
+ #endif
322
+
323
+ /* Default declaration of generated scanner - a define so the user can
324
+ * easily add parameters.
325
+ */
326
+ #ifndef YY_DECL
327
+ #define YY_DECL_IS_OURS 1
328
+
329
+ extern int linguist_yylex (yyscan_t yyscanner);
330
+
331
+ #define YY_DECL int linguist_yylex (yyscan_t yyscanner)
332
+ #endif /* !YY_DECL */
333
+
334
+ /* yy_get_previous_state - get the state just before the EOB char was reached */
335
+
336
+ #undef YY_NEW_FILE
337
+ #undef YY_FLUSH_BUFFER
338
+ #undef yy_set_bol
339
+ #undef yy_new_buffer
340
+ #undef yy_set_interactive
341
+ #undef YY_DO_BEFORE_ACTION
342
+
343
+ #ifdef YY_DECL_IS_OURS
344
+ #undef YY_DECL_IS_OURS
345
+ #undef YY_DECL
346
+ #endif
347
+
348
+ #line 117 "tokenizer.l"
349
+
350
+
351
+ #line 352 "lex.linguist_yy.h"
352
+ #undef linguist_yyIN_HEADER
353
+ #endif /* linguist_yyHEADER_H */
@@ -0,0 +1,64 @@
1
+ #include "ruby.h"
2
+ #include "linguist.h"
3
+ #include "lex.linguist_yy.h"
4
+
5
+ int linguist_yywrap(yyscan_t yyscanner) {
6
+ return 1;
7
+ }
8
+
9
+ static VALUE rb_tokenizer_extract_tokens(VALUE self, VALUE rb_data) {
10
+ YY_BUFFER_STATE buf;
11
+ yyscan_t scanner;
12
+ struct tokenizer_extra extra;
13
+ VALUE ary, s;
14
+ long len;
15
+ int r;
16
+
17
+ Check_Type(rb_data, T_STRING);
18
+
19
+ len = RSTRING_LEN(rb_data);
20
+ if (len > 100000)
21
+ len = 100000;
22
+
23
+ linguist_yylex_init_extra(&extra, &scanner);
24
+ buf = linguist_yy_scan_bytes(RSTRING_PTR(rb_data), (int) len, scanner);
25
+
26
+ ary = rb_ary_new();
27
+ do {
28
+ extra.type = NO_ACTION;
29
+ extra.token = NULL;
30
+ r = linguist_yylex(scanner);
31
+ switch (extra.type) {
32
+ case NO_ACTION:
33
+ break;
34
+ case REGULAR_TOKEN:
35
+ rb_ary_push(ary, rb_str_new2(extra.token));
36
+ free(extra.token);
37
+ break;
38
+ case SHEBANG_TOKEN:
39
+ s = rb_str_new2("SHEBANG#!");
40
+ rb_str_cat2(s, extra.token);
41
+ rb_ary_push(ary, s);
42
+ free(extra.token);
43
+ break;
44
+ case SGML_TOKEN:
45
+ s = rb_str_new2(extra.token);
46
+ rb_str_cat2(s, ">");
47
+ rb_ary_push(ary, s);
48
+ free(extra.token);
49
+ break;
50
+ }
51
+ } while (r);
52
+
53
+ linguist_yy_delete_buffer(buf, scanner);
54
+ linguist_yylex_destroy(scanner);
55
+
56
+ return ary;
57
+ }
58
+
59
+ __attribute__((visibility("default"))) void Init_linguist() {
60
+ VALUE rb_mLinguist = rb_define_module("Linguist");
61
+ VALUE rb_cTokenizer = rb_define_class_under(rb_mLinguist, "Tokenizer", rb_cObject);
62
+
63
+ rb_define_method(rb_cTokenizer, "extract_tokens", rb_tokenizer_extract_tokens, 1);
64
+ }
@@ -0,0 +1,11 @@
1
#ifndef LINGUIST_H
#define LINGUIST_H

/*
 * What, if anything, one call to the lexer produced.  The Ruby binding
 * switches on this value to decide how to turn the token into a String.
 */
enum tokenizer_type {
  NO_ACTION,     /* nothing to emit for this call */
  REGULAR_TOKEN, /* emitted unchanged */
  SHEBANG_TOKEN, /* emitted with a "SHEBANG#!" prefix */
  SGML_TOKEN,    /* emitted with ">" appended */
};

/*
 * Per-scanner user data (flex "extra" pointer) through which the lexer hands
 * a single token back to its caller.
 */
struct tokenizer_extra {
  char *token;              /* heap copy made by the lexer; the caller free()s it */
  enum tokenizer_type type; /* how the caller should treat `token` */
};

#endif /* LINGUIST_H */
@@ -0,0 +1,119 @@
1
+ %{
2
+
3
+ #include "linguist.h"
4
+
5
/*
 * feed_token(tok, typ): publish one token to the caller of yylex() by storing
 * it in the struct tokenizer_extra that yyextra points to.  Every call in
 * this file passes a strdup()'d copy of yytext (or of its tail) as `tok`.
 */
#define feed_token(tok, typ) do { \
    yyextra->token = (tok); \
    yyextra->type = (typ); \
  } while (0)

/*
 * eat_until_eol(): discard input up to and including the next newline.
 * If the input runs out first, the scan is ended with yyterminate().
 *
 * NOTE(review): this and eat_until_unescaped() rely on input() reporting end
 * of input as EOF, which is what the bundled flex 2.5.39 output does.
 * Re-check that before regenerating the scanner with another flex release.
 */
#define eat_until_eol() do { \
    int eol_ch_; \
    do { \
      eol_ch_ = input(yyscanner); \
    } while (eol_ch_ != '\n' && eol_ch_ != EOF); \
    if (eol_ch_ == EOF) \
      yyterminate(); \
  } while (0)

/*
 * eat_until_unescaped(q): discard input up to and including the next
 * character equal to `q` that is not preceded by a backslash.  A newline
 * also stops the skipping.  The character right after a backslash is
 * consumed without being inspected.  Running out of input ends the scan
 * with yyterminate().
 */
#define eat_until_unescaped(q) do { \
    int esc_ch_; \
    while ((esc_ch_ = input(yyscanner)) != EOF) { \
      if (esc_ch_ == '\n') \
        break; \
      if (esc_ch_ == '\\') { \
        esc_ch_ = input(yyscanner); \
        if (esc_ch_ == EOF) \
          yyterminate(); \
      } else if (esc_ch_ == (q)) \
        break; \
    } \
    if (esc_ch_ == EOF) \
      yyterminate(); \
  } while (0)
32
+
33
+ %}
34
+
35
+ %option never-interactive yywrap reentrant nounput warn nodefault header-file="lex.linguist_yy.h" extra-type="struct tokenizer_extra *" prefix="linguist_yy"
36
+ %x sgml c_comment xml_comment haskell_comment ocaml_comment python_dcomment python_scomment
37
+
38
+ %%
39
+
40
^#![ \t]*([[:alnum:]_\/]*\/)?env([ \t]+([^ \t=]*=[^ \t]*))*[ \t]+[[:alpha:]_]+ {
    /*
     * Shebang that goes through env, optionally with VAR=value assignments:
     * emit the interpreter name as a SHEBANG_TOKEN and drop the rest of the
     * line.
     *
     * The match always ends in one or more blanks followed by the name, so
     * walk back from the end of the match to the nearest space or tab.
     * Searching for a space only would pick the wrong start whenever the
     * name is separated by a tab.
     */
    const char *off = yytext + yyleng;
    while (off > yytext && off[-1] != ' ' && off[-1] != '\t')
      --off;
    feed_token(strdup(off), SHEBANG_TOKEN);
    eat_until_eol();
    return 1;
  }
50
+
51
^#![ \t]*[[:alpha:]_\/]+ {
    /*
     * Plain shebang: emit the last path component as a SHEBANG_TOKEN and
     * drop the rest of the line.
     *
     * The leading "#!" and blanks are skipped before looking for the last
     * slash, so a shebang without any slash yields just the interpreter name
     * instead of the whole match including "#!".
     */
    const char *path = yytext + 2;
    const char *off;
    while (*path == ' ' || *path == '\t')
      ++path;
    off = strrchr(path, '/');
    off = off ? off + 1 : path;
    if (strcmp(off, "env") == 0) {
      /* env on its own names no interpreter: skip the line, emit nothing. */
      eat_until_eol();
    } else {
      feed_token(strdup(off), SHEBANG_TOKEN);
      eat_until_eol();
      return 1;
    }
  }
65
+
66
^[ \t]*(\/\/|--|\#|%|\")" ".* { /* single-line comment marker followed by a space: nothing to emit */ }

  /* Multi-line comment openers switch to the matching exclusive state. */
"/*" { BEGIN(c_comment); }
  /* See below for xml_comment start. */
"{-" { BEGIN(haskell_comment); }
"(*" { BEGIN(ocaml_comment); }
"\"\"\"" { BEGIN(python_dcomment); }
"'''" { BEGIN(python_scomment); }

  /* Inside a comment state everything is swallowed until the closer. */
<c_comment,xml_comment,haskell_comment,ocaml_comment,python_dcomment,python_scomment>.|\n { /* nothing */ }
<c_comment>"*/" { BEGIN(INITIAL); }
<xml_comment>"-->" { BEGIN(INITIAL); }
<haskell_comment>"-}" { BEGIN(INITIAL); }
<ocaml_comment>"*)" { BEGIN(INITIAL); }
<python_dcomment>"\"\"\"" { BEGIN(INITIAL); }
<python_scomment>"'''" { BEGIN(INITIAL); }

  /* String literals: empty ones are skipped, others up to the closing quote or end of line. */
\"\"|'' { /* nothing */ }
\" { eat_until_unescaped('"'); }
' { eat_until_unescaped('\''); }

  /* Numeric literals (hex or decimal, optional suffix or exponent) are dropped. */
(0x[0-9a-fA-F]([0-9a-fA-F]|\.)*|[0-9]([0-9]|\.)*)([uU][lL]{0,2}|([eE][-+][0-9]*)?[fFlL]*) { /* nothing */ }

  /* Tag opener: either the start of an XML comment or an SGML tag token. */
\<[^ \t\n\r<>]+/>|" "[^<>\n]{0,2048}> {
    if (strcmp(yytext, "<!--") != 0) {
      feed_token(strdup(yytext), SGML_TOKEN);
      BEGIN(sgml);
      return 1;
    }
    BEGIN(xml_comment);
  }

  /* Inside a tag: attributes are emitted by name, quoted values are skipped. */
<sgml>[[:alnum:]_]+=/\" { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('"'); return 1; }
<sgml>[[:alnum:]_]+=/' { feed_token(strdup(yytext), REGULAR_TOKEN); input(yyscanner); eat_until_unescaped('\''); return 1; }
<sgml>[[:alnum:]_]+=[[:alnum:]_]* {
    /*
     * Unquoted value: keep the name and the equals sign, cut the value.
     * strdup() can fail, so the copy is only truncated when it exists; a
     * NULL token is handed on for the caller to deal with.
     */
    char *tok = strdup(yytext);
    if (tok)
      *(strchr(tok, '=') + 1) = '\0';
    feed_token(tok, REGULAR_TOKEN);
    return 1;
  }
<sgml>[[:alnum:]_]+ { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
<sgml>\> { BEGIN(INITIAL); }
<sgml>.|\n { /* nothing */ }

  /* Punctuation and brackets are tokens of their own. */
;|\{|\}|\(|\)|\[|\] { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }

[[:alnum:]_.@#/*]+ {
    const size_t n = strlen(yytext);
    if (strncmp(yytext, "/*", 2) != 0) {
      feed_token(strdup(yytext), REGULAR_TOKEN);
      return 1;
    }
    /*
     * The word begins with a C comment opener.  Unless the same word also
     * ends with the closer (at least four characters, so opener and closer
     * cannot overlap), the comment continues past this match.
     */
    if (n < 4 || strcmp(yytext + n - 2, "*/") != 0)
      BEGIN(c_comment);
  }

  /* Operators. */
\<\<?|\+|\-|\*|\/|%|&&?|\|\|? { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }

.|\n { /* nothing */ }
117
+
118
+ %%
119
+