langscan 1.2

Files changed (168)
  1. data/AUTHORS.txt +19 -0
  2. data/History.txt +126 -0
  3. data/Manifest.txt +167 -0
  4. data/README.rdoc +89 -0
  5. data/Rakefile +40 -0
  6. data/ext/langscan/_make_c.rb +20 -0
  7. data/ext/langscan/_make_h.rb +30 -0
  8. data/ext/langscan/_template.c +134 -0
  9. data/ext/langscan/_template.h +53 -0
  10. data/ext/langscan/c/c/Makefile +157 -0
  11. data/ext/langscan/c/c/c.c +134 -0
  12. data/ext/langscan/c/c/c.h +66 -0
  13. data/ext/langscan/c/c/ctok.c +4622 -0
  14. data/ext/langscan/c/c/ctok.l +212 -0
  15. data/ext/langscan/c/c/extconf.rb +3 -0
  16. data/ext/langscan/c/c/modulename.txt +1 -0
  17. data/ext/langscan/c/c/tokenlist.txt +13 -0
  18. data/ext/langscan/csharp/csharp/Makefile +157 -0
  19. data/ext/langscan/csharp/csharp/csharp.c +134 -0
  20. data/ext/langscan/csharp/csharp/csharp.h +65 -0
  21. data/ext/langscan/csharp/csharp/csharptok.c +2965 -0
  22. data/ext/langscan/csharp/csharp/csharptok.l +200 -0
  23. data/ext/langscan/csharp/csharp/extconf.rb +3 -0
  24. data/ext/langscan/csharp/csharp/modulename.txt +1 -0
  25. data/ext/langscan/csharp/csharp/tokenlist.txt +12 -0
  26. data/ext/langscan/d/d/Makefile +157 -0
  27. data/ext/langscan/d/d/d.c +134 -0
  28. data/ext/langscan/d/d/d.h +64 -0
  29. data/ext/langscan/d/d/dtok.c +5461 -0
  30. data/ext/langscan/d/d/dtok.l +282 -0
  31. data/ext/langscan/d/d/extconf.rb +3 -0
  32. data/ext/langscan/d/d/modulename.txt +1 -0
  33. data/ext/langscan/d/d/tokenlist.txt +11 -0
  34. data/ext/langscan/elisp/elisp/Makefile +157 -0
  35. data/ext/langscan/elisp/elisp/elisp.c +134 -0
  36. data/ext/langscan/elisp/elisp/elisp.h +62 -0
  37. data/ext/langscan/elisp/elisp/elisptok.c +2101 -0
  38. data/ext/langscan/elisp/elisp/elisptok.l +151 -0
  39. data/ext/langscan/elisp/elisp/extconf.rb +3 -0
  40. data/ext/langscan/elisp/elisp/modulename.txt +1 -0
  41. data/ext/langscan/elisp/elisp/tokenlist.txt +9 -0
  42. data/ext/langscan/java/java/Makefile +157 -0
  43. data/ext/langscan/java/java/extconf.rb +3 -0
  44. data/ext/langscan/java/java/java.c +134 -0
  45. data/ext/langscan/java/java/java.h +64 -0
  46. data/ext/langscan/java/java/javatok.c +2090 -0
  47. data/ext/langscan/java/java/javatok.l +155 -0
  48. data/ext/langscan/java/java/modulename.txt +1 -0
  49. data/ext/langscan/java/java/tokenlist.txt +11 -0
  50. data/ext/langscan/javascript/javascript/Makefile +157 -0
  51. data/ext/langscan/javascript/javascript/extconf.rb +3 -0
  52. data/ext/langscan/javascript/javascript/javascript.c +134 -0
  53. data/ext/langscan/javascript/javascript/javascript.h +63 -0
  54. data/ext/langscan/javascript/javascript/javascripttok.c +2051 -0
  55. data/ext/langscan/javascript/javascript/javascripttok.l +147 -0
  56. data/ext/langscan/javascript/javascript/modulename.txt +1 -0
  57. data/ext/langscan/javascript/javascript/tokenlist.txt +10 -0
  58. data/ext/langscan/pairmatcher/pairmatcher/Makefile +157 -0
  59. data/ext/langscan/pairmatcher/pairmatcher/extconf.rb +3 -0
  60. data/ext/langscan/pairmatcher/pairmatcher/pairmatcher.c +890 -0
  61. data/ext/langscan/php/php/Makefile +157 -0
  62. data/ext/langscan/php/php/extconf.rb +3 -0
  63. data/ext/langscan/php/php/modulename.txt +1 -0
  64. data/ext/langscan/php/php/php.c +134 -0
  65. data/ext/langscan/php/php/php.h +64 -0
  66. data/ext/langscan/php/php/phptok.c +2406 -0
  67. data/ext/langscan/php/php/phptok.l +212 -0
  68. data/ext/langscan/php/php/tokenlist.txt +11 -0
  69. data/ext/langscan/post-distclean.rb +21 -0
  70. data/ext/langscan/pre-config.rb +57 -0
  71. data/ext/langscan/python/python/Makefile +157 -0
  72. data/ext/langscan/python/python/extconf.rb +3 -0
  73. data/ext/langscan/python/python/modulename.txt +1 -0
  74. data/ext/langscan/python/python/python.c +134 -0
  75. data/ext/langscan/python/python/python.h +61 -0
  76. data/ext/langscan/python/python/pythontok.c +2102 -0
  77. data/ext/langscan/python/python/pythontok.l +155 -0
  78. data/ext/langscan/python/python/tokenlist.txt +8 -0
  79. data/ext/langscan/ruby/compat/ripper/Makefile +158 -0
  80. data/ext/langscan/ruby/compat/ripper/depend +1 -0
  81. data/ext/langscan/ruby/compat/ripper/extconf.rb +4 -0
  82. data/ext/langscan/ruby/compat/ripper/include/eventids1.c +251 -0
  83. data/ext/langscan/ruby/compat/ripper/include/eventids2.c +277 -0
  84. data/ext/langscan/ruby/compat/ripper/include/lex.c +138 -0
  85. data/ext/langscan/ruby/compat/ripper/ripper.c +14420 -0
  86. data/ext/langscan/scheme/scheme/Makefile +157 -0
  87. data/ext/langscan/scheme/scheme/extconf.rb +3 -0
  88. data/ext/langscan/scheme/scheme/modulename.txt +1 -0
  89. data/ext/langscan/scheme/scheme/scheme.c +134 -0
  90. data/ext/langscan/scheme/scheme/scheme.h +60 -0
  91. data/ext/langscan/scheme/scheme/schemetok.c +2447 -0
  92. data/ext/langscan/scheme/scheme/schemetok.l +177 -0
  93. data/ext/langscan/scheme/scheme/tokenlist.txt +7 -0
  94. data/ext/langscan/sh/sh/Makefile +157 -0
  95. data/ext/langscan/sh/sh/extconf.rb +3 -0
  96. data/ext/langscan/sh/sh/modulename.txt +1 -0
  97. data/ext/langscan/sh/sh/sh.c +134 -0
  98. data/ext/langscan/sh/sh/sh.h +61 -0
  99. data/ext/langscan/sh/sh/shtok.c +2470 -0
  100. data/ext/langscan/sh/sh/shtok.l +325 -0
  101. data/ext/langscan/sh/sh/tokenlist.txt +8 -0
  102. data/lib/langscan.rb +124 -0
  103. data/lib/langscan/_common.rb +50 -0
  104. data/lib/langscan/_easyscanner.rb +78 -0
  105. data/lib/langscan/_pairmatcher.rb +46 -0
  106. data/lib/langscan/_type.rb +125 -0
  107. data/lib/langscan/autoconf.rb +51 -0
  108. data/lib/langscan/automake.rb +51 -0
  109. data/lib/langscan/brainfuck.rb +48 -0
  110. data/lib/langscan/c.rb +144 -0
  111. data/lib/langscan/csharp.rb +101 -0
  112. data/lib/langscan/css.rb +109 -0
  113. data/lib/langscan/d.rb +201 -0
  114. data/lib/langscan/eiffel.rb +167 -0
  115. data/lib/langscan/elisp.rb +132 -0
  116. data/lib/langscan/io.rb +84 -0
  117. data/lib/langscan/java.rb +95 -0
  118. data/lib/langscan/javascript.rb +97 -0
  119. data/lib/langscan/lua.rb +116 -0
  120. data/lib/langscan/ocaml.rb +298 -0
  121. data/lib/langscan/ocaml/camlexer.ml +28 -0
  122. data/lib/langscan/ocaml/lexer.mll +230 -0
  123. data/lib/langscan/ocaml/types.ml +36 -0
  124. data/lib/langscan/perl.rb +87 -0
  125. data/lib/langscan/perl/tokenizer.pl +231 -0
  126. data/lib/langscan/php.rb +80 -0
  127. data/lib/langscan/python.rb +101 -0
  128. data/lib/langscan/rpmspec.rb +71 -0
  129. data/lib/langscan/ruby.rb +164 -0
  130. data/lib/langscan/ruby/compat/README +5 -0
  131. data/lib/langscan/ruby/compat/ripper.rb +4 -0
  132. data/lib/langscan/ruby/compat/ripper/core.rb +918 -0
  133. data/lib/langscan/ruby/compat/ripper/filter.rb +70 -0
  134. data/lib/langscan/ruby/compat/ripper/lexer.rb +179 -0
  135. data/lib/langscan/ruby/compat/ripper/sexp.rb +100 -0
  136. data/lib/langscan/scheme.rb +160 -0
  137. data/lib/langscan/sh.rb +116 -0
  138. data/lib/langscan/text.rb +37 -0
  139. data/metaconfig +2 -0
  140. data/script/console +10 -0
  141. data/script/destroy +14 -0
  142. data/script/generate +14 -0
  143. data/script/makemanifest.rb +21 -0
  144. data/setup.rb +1604 -0
  145. data/tasks/extconf.rake +13 -0
  146. data/tasks/extconf/langscan.rake +42 -0
  147. data/test/langscan/brainfuck/test/test_scan.rb +55 -0
  148. data/test/langscan/c/test/test_scan.rb +216 -0
  149. data/test/langscan/c/test/test_token.rb +41 -0
  150. data/test/langscan/csharp/test/test_scan.rb +157 -0
  151. data/test/langscan/css/test/test_css.rb +79 -0
  152. data/test/langscan/d/test/test_scan.rb +233 -0
  153. data/test/langscan/d/test/test_token.rb +205 -0
  154. data/test/langscan/eiffel/test/test_eiffel.rb +95 -0
  155. data/test/langscan/elisp/test/test_elisp.rb +177 -0
  156. data/test/langscan/io/test/test_io.rb +79 -0
  157. data/test/langscan/java/test/test_java.rb +74 -0
  158. data/test/langscan/javascript/test/test_javascript.rb +39 -0
  159. data/test/langscan/lua/test/test_lua.rb +69 -0
  160. data/test/langscan/ocaml/test/test_ocaml.rb +161 -0
  161. data/test/langscan/php/test/test_scan.rb +138 -0
  162. data/test/langscan/python/test/test_scan.rb +105 -0
  163. data/test/langscan/rpmspec/test/test_rpmspec.rb +51 -0
  164. data/test/langscan/ruby/test/test_scan.rb +71 -0
  165. data/test/langscan/scheme/test/test_scan.rb +198 -0
  166. data/test/test_helper.rb +7 -0
  167. data/test/test_langscan.rb +123 -0
  168. metadata +296 -0
data/lib/langscan/ocaml/camlexer.ml
@@ -0,0 +1,28 @@
+(*
+  camlexer - Lexical Analyzer for Gonzui ocamlsupport
+
+  Copyright (C) 2005 Soutaro Matsumoto <matsumoto@soutaro.com>
+  All rights reserved.
+  This is free software with ABSOLUTELY NO WARRANTY.
+
+  You can redistribute it and/or modify it under the terms of
+  the GNU General Public License version 2.
+*)
+
+(* $Id: camlexer.ml,v 1.1.1.1 2005/09/15 19:38:39 bashi Exp $ *)
+
+let main () =
+  try
+    let lexbuf = Lexing.from_channel stdin in
+    while true do
+      let ((lnum,bnum),tname,lexed_str) = (Lexer.token lexbuf) in
+      begin
+        Printf.printf "%d:%d:%s:%s\n" lnum bnum (Types.to_string tname) lexed_str;
+        flush stdout;
+      end
+    done
+  with
+    Lexer.EOF -> exit 0
+
+let _ = main ()
+
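
camlexer.ml pumps tokens from Lexer.token and prints one record per token in the form lnum:bnum:type:text (the Printf.printf call above). As a minimal sketch of how a consumer might pull such a record apart, assuming one single-line token per output line (multi-line comment and string tokens would need extra care), the fields can be recovered with a bounded split:

    # Illustrative only: parse one camlexer output record "lnum:bnum:type:text".
    # The token text may itself contain ":", so split into at most four fields.
    record = "3:42:keyword:let"
    lnum, bnum, type, text = record.chomp.split(":", 4)
    puts "#{type} at line #{lnum}, byte offset #{bnum}: #{text.inspect}"
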
data/lib/langscan/ocaml/lexer.mll
@@ -0,0 +1,230 @@
+(*
+  camlexer - Lexical Analyzer for Gonzui ocamlsupport
+
+  Copyright (C) 2005 Soutaro Matsumoto <matsumoto@soutaro.com>
+  All rights reserved.
+  This is free software with ABSOLUTELY NO WARRANTY.
+
+  You can redistribute it and/or modify it under the terms of
+  the GNU General Public License version 2.
+
+*)
+
+(* $Id: lexer.mll,v 1.1.1.1 2005/09/15 19:38:39 bashi Exp $ *)
+
+{
+  exception EOF
+
+  open Types
+
+  let lnum = ref 1
+  let inc_lnum () =
+    begin
+      lnum := !lnum+1;
+    end
+
+  let reset () =
+    lnum := 1
+
+  let get_pos lexbuf =
+    let pos = Lexing.lexeme_start_p lexbuf in
+    let boff = pos.Lexing.pos_bol in
+    let cnum = pos.Lexing.pos_cnum in
+    !lnum,boff+cnum
+
+  let str_lexbuf = ref (None: (Lexing.lexbuf) option)
+
+}
+
+let newline = ('\n' | '\r' | "\r\n")
+let blank = [' ' '\t']
+let letter = ['a'-'z' 'A'-'Z']
+let num = ['0'-'9']
+let ident = (letter | '_') (letter | num | '_' | '\'')*
+let int_lit =
+    (('-')? num (num | '_')*)
+  | (('-')? ("0x"|"0X") (num | ['A'-'F'] ['a'-'f']) (num | ['A'-'F'] ['a'-'f'] | '_')*)
+  | (('-')? ("0o"|"0O") (['0'-'7']) (['0'-'7'] | '_')*)
+  | (('-')? ("0b"|"0B") (['0'-'1']) (['0'-'1'] | '_')*)
+let float_lit =
+  ('-')? num (num | '_')* ('.' (num | '_')*)? (("e"|"E") ('+'|'-')? num (num | '_')*)?
+let regular_char = [^ '\'']
+let escape_sequence =
+    '\\' ['\\' '\"' '\'' 'n' 't' 'b' 'r']
+  | '\\' num num num
+  | "\\x" (num | ['A'-'F'] | ['a'-'f']) (num | ['A'-'F'] | ['a'-'f'])
+let char_lit =
+    '\'' regular_char '\''
+  | '\'' escape_sequence '\''
+let label = ['a'-'z'] (letter | num | '_' | '\'')*
+let operator_char = ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']
+let infix_symbol = ['=' '<' '>' '@' '|' '&' '+' '-' '*' '/' '$' '%'] operator_char*
+let prefix_symbol = ['!' '?' '~'] operator_char*
+let keywords =
+    "and" | "as" | "assert" | "asr" | "begin" | "class"
+  | "constraint" | "do" | "done" | "downto" | "else" | "end"
+  | "exception" | "external" | "false" | "for" | "fun" | "function"
+  | "functor" | "if" | "in" | "include" | "inherit" | "initializer"
+  | "land" | "lazy" | "let" | "lor" | "lsl" | "lsr"
+  | "lxor" | "match" | "method" | "mod" | "module" | "mutable"
+  | "new" | "object" | "of" | "open" | "or" | "private"
+  | "rec" | "sig" | "struct" | "then" | "to" | "true"
+  | "try" | "type" | "val" | "virtual" | "when" | "while" | "with"
+let puncts =
+    "!=" | "#" | "&" | "&&" | "\'" | "(" | ")" | "*" | "+" | "," | "-"
+  | "-." | "->" | "." | ".." | ":" | "::" | ":=" | ":>" | ";" | ";;" | "<"
+  | "<-" | "=" | ">" | ">]" | ">}" | "?" | "??" | "[" | "[<" | "[>" | "[|"
+  | "]" | "_" | "`" | "{" | "{<" | "|" | "|]" | "}" | "~"
+let camlp4_keywords = "parser"
+let camlp4_puncts =
+  "<<" | "<:" | ">>" | "$" | "$$" | "$:"
+let ocamlyacc_keywords =
+  "%token" | "%start" | "%type" | "%left" | "%right" | "%nonassoc" | "%prec"
+let ocamlyacc_puncts =
+  "%{" | "%}" | "%%"
+let ocamlyacc_ident = "$" num+
+let linenum_directive = '#' ' ' num+
+  | '#' ' ' num+ ' ' '\"' [^ '\"']* '\"'
+let built_in_constants = "false" | "true" | "()" | "[]"
+
+rule token = parse
+  | newline
+      {
+        begin
+          inc_lnum();
+          token lexbuf;
+        end
+      }
+  | blank +
+      { token lexbuf }
+  | linenum_directive {
+      (get_pos lexbuf, Ttext, Lexing.lexeme lexbuf);
+    }
+  | keywords | camlp4_keywords | ocamlyacc_keywords {
+      (get_pos lexbuf, Tkeyword, Lexing.lexeme lexbuf)
+    }
+  | built_in_constants {
+      (get_pos lexbuf, Tkeyword, Lexing.lexeme lexbuf)
+    }
+  | "/*" {
+      let pos = get_pos lexbuf in
+      (pos, Tcomment, ocamlyacc_comment 0 "/*" lexbuf);
+    }
+  | "(*" {
+      let pos = get_pos lexbuf in
+      (pos, Tcomment, comment 0 "(*" lexbuf)
+    }
+  | '\"' {
+      let pos = get_pos lexbuf in
+      (pos, Tstring, string "\"" lexbuf) }
+  | puncts | camlp4_puncts | ocamlyacc_puncts {
+      (get_pos lexbuf, Tpunct, Lexing.lexeme lexbuf)
+    }
+  | infix_symbol | prefix_symbol {
+      (get_pos lexbuf, Tident, Lexing.lexeme lexbuf)
+    }
+  | ('~'|'?') label ':' {
+      let s = Lexing.lexeme lexbuf in
+      let name = String.sub s 1 (String.length s - 2) in
+      (get_pos lexbuf, Tident, s)
+    }
+  | ident | ocamlyacc_ident {
+      (get_pos lexbuf, Tident, Lexing.lexeme lexbuf)
+    }
+  | char_lit {
+      (get_pos lexbuf, Tchar, Lexing.lexeme lexbuf)
+    }
+  | int_lit {
+      (get_pos lexbuf, Tint, Lexing.lexeme lexbuf)
+    }
+  | float_lit {
+      (get_pos lexbuf, Tfloat, Lexing.lexeme lexbuf)
+    }
+  | eof { raise EOF }
+  | _
+      { token lexbuf }
+
+and comment lv acc = parse
+  | newline {
+      begin
+        inc_lnum();
+        comment lv (acc ^ "\\o") lexbuf;
+      end
+    }
+  | "(*" {
+      comment (lv+1) (acc ^ Lexing.lexeme lexbuf) lexbuf
+    }
+  | "*)" {
+      if lv = 0
+      then
+        acc ^ "*)"
+      else
+        comment (lv-1) (acc ^ Lexing.lexeme lexbuf) lexbuf
+    }
+  | ([^ '\\'] as c1) "\"" {
+      let s = string "\"" lexbuf in
+      match !str_lexbuf with
+        Some lexbuf -> comment lv (acc ^ Printf.sprintf "%c" c1 ^ s) lexbuf
+    }
+  | char_lit {
+      comment lv (acc ^ Lexing.lexeme lexbuf) lexbuf
+    }
+  | _ {
+      let s = Lexing.lexeme lexbuf in
+      comment lv (acc^s) lexbuf
+    }
+
+and string acc = parse
+  | newline {
+      begin
+        inc_lnum();
+        string (acc ^ "\\o") lexbuf;
+      end
+    }
+  | '\"' {
+      begin
+        str_lexbuf := Some lexbuf;
+        acc ^ "\"";
+      end
+    }
+  | escape_sequence {
+      string (acc ^ Lexing.lexeme lexbuf) lexbuf
+    }
+  | char_lit {
+      string (acc ^ Lexing.lexeme lexbuf) lexbuf
+    }
+  | _ {
+      let s = Lexing.lexeme lexbuf in
+      string (acc^s) lexbuf
+    }
+
+and ocamlyacc_comment lv acc = parse
+  | newline {
+      begin
+        inc_lnum();
+        ocamlyacc_comment lv (acc ^ "\\o") lexbuf;
+      end
+    }
+  | "/*" {
+      ocamlyacc_comment (lv+1) (acc ^ Lexing.lexeme lexbuf) lexbuf
+    }
+  | "*/" {
+      if lv = 0
+      then
+        acc ^ "*/"
+      else
+        ocamlyacc_comment (lv-1) (acc ^ Lexing.lexeme lexbuf) lexbuf
+    }
+  | "\"" {
+      let s = string "\"" lexbuf in
+      match !str_lexbuf with
+        Some lexbuf -> ocamlyacc_comment lv (acc ^ s) lexbuf
+    }
+  | char_lit {
+      ocamlyacc_comment lv (acc ^ Lexing.lexeme lexbuf) lexbuf
+    }
+  | _ {
+      let s = Lexing.lexeme lexbuf in
+      ocamlyacc_comment lv (acc^s) lexbuf
+    }
+
data/lib/langscan/ocaml/types.ml
@@ -0,0 +1,36 @@
+(*
+  camlexer - Lexical Analyzer for Gonzui ocamlsupport
+
+  Copyright (C) 2005 Soutaro Matsumoto <matsumoto@soutaro.com>
+  All rights reserved.
+  This is free software with ABSOLUTELY NO WARRANTY.
+
+  You can redistribute it and/or modify it under the terms of
+  the GNU General Public License version 2.
+*)
+
+(* $Id: types.ml,v 1.1.1.1 2005/09/15 19:38:38 bashi Exp $ *)
+
+type gonzui_type = Tident
+  | Tpunct
+  | Tfuncdef
+  | Ttext
+  | Tstring
+  | Tcomment
+  | Tkeyword
+  | Tchar
+  | Tint
+  | Tfloat
+
+let to_string = function
+    Tident -> "ident"
+  | Tpunct -> "punct"
+  | Tfuncdef -> "funcdef"
+  | Ttext -> "text"
+  | Tstring -> "string"
+  | Tcomment -> "comment"
+  | Tkeyword -> "keyword"
+  | Tchar -> "character"
+  | Tfloat -> "float"
+  | Tint -> "integer"
+
data/lib/langscan/perl.rb
@@ -0,0 +1,87 @@
+#
+# perl.rb - a Perl module of LangScan
+#
+# Copyright (C) 2005 Tatsuhiko Miyagawa <miyagawa@bulknews.net>
+# All rights reserved.
+# This is free software with ABSOLUTELY NO WARRANTY.
+#
+# You can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2.
+#
+
+dn = "/dev/null"
+dn = "nul" if (/mswin|mingw|bccwin/ =~ RUBY_PLATFORM)
+unless system("perl -MPPI -e 1 2>#{dn}")
+  raise LoadError.new("PPI module is required")
+end
+
+require 'langscan/_common'
+
+module LangScan
+  module Perl
+    module_function
+
+    def name
+      "Perl"
+    end
+
+    def abbrev
+      "perl"
+    end
+
+    def extnames
+      [".pl", ".PL", ".pm", ".t" ] # XXX remove ".t"
+    end
+
+    PERLTOKENIZER_PATH = $LOAD_PATH.map {|path|
+      File.join(path, "langscan/perl/tokenizer.pl")
+    }.find {|path| File.file?(path) }
+    raise "tokenizer.pl not found" if PERLTOKENIZER_PATH.nil?
+
+    def shell_escape(file_name)
+      '"' + file_name.gsub(/([$"\\`])/, "\\\\\\1") + '"'
+    end
+
+    def open_tokenizer
+      command_line = sprintf("perl %s 2>/dev/null",
+                             shell_escape(PERLTOKENIZER_PATH))
+      @io = IO.popen(command_line, "r+")
+    end
+
+    # LangScan::Perl.scan iterates over Perl program.
+    # It yields for each Fragment.
+    def scan(input)
+      open_tokenizer if @io.nil? or @io.closed? # in case of Perl error
+      @io.puts(input.length)
+      @io.write(input)
+      inputlen = input.length
+      buflen = 0
+      begin
+        while (buflen < inputlen)
+          type = @io.readline.chomp.intern
+          lineno = @io.readline.chomp.to_i
+          byteno = @io.readline.chomp.to_i
+          bodylen = @io.readline.chomp.to_i
+          text = @io.read(bodylen)
+          if type.nil? or text.nil? or lineno.nil? or byteno.nil?
+            raise ScanFailed.new("Unexpected output from tokenizer.pl")
+          end
+          yield Fragment.new(type, text, lineno, byteno)
+          @io.read(1) # newline
+          buflen += bodylen
+        end
+      rescue EOFError
+        @io.close
+        raise ScanFailed.new("tokenizer.pl failed to parse")
+      end
+    end
+
+    LangScan.register(self)
+  end
+end
+
+
+
+
+
+
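
perl.rb keeps a single tokenizer.pl process alive over IO.popen and turns its output into Fragment objects. A usage sketch, assuming the langscan gem and Perl's PPI module are installed, and that Fragment (defined in langscan/_common, not shown in this diff) exposes type, text, lineno and byteno readers:

    require 'langscan'           # or require 'langscan/perl' if the Perl
                                 # module is not pulled in automatically

    src = File.read("example.pl")            # any Perl source file
    LangScan::Perl.scan(src) do |frag|
      printf("%-10s line %-4d byte %-5d %s\n",
             frag.type, frag.lineno, frag.byteno, frag.text.inspect)
    end
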
data/lib/langscan/perl/tokenizer.pl
@@ -0,0 +1,231 @@
+# tokenizer.pl: tokenize Perl scripts as gonzui langscan format
+#
+# Author: Tatsuhiko Miyagawa <miyagawa@bulknews.net>
+# License: Same as Perl (Artistic/GPL2)
+#
+
+use strict;
+use PPI::Tokenizer;
+$PPI::Tokenizer::ALLOW_NONASCII = 1;
+
+our $Debug = 0;
+$| = 1;
+
+# TODO:
+# 'string' is abused
+# regexp is string
+# PPI fails to tokenize source code with UTF-8 binary
+
+our(%TokenMap, %ReservedWords, %BuiltinFunctions);
+
+if ($ARGV[0] && $ARGV[0] eq '-d') {
+    # debug mode
+    open my $fh, $ARGV[1] or die "$ARGV[1]: $!";
+    my $code = join '', <$fh>;
+    Tokenizer->new->tokenize(\$code);
+} else {
+    # persistent mode
+    my $tokenizer = Tokenizer->new;
+    while (1) {
+        chomp(my $length = <STDIN>);
+        last unless defined $length;
+        read(STDIN, my($code), $length);
+        $tokenizer->tokenize(\$code);
+        $tokenizer->reset();
+    }
+}
+
+package Tokenizer;
+
+sub new {
+    my $class = shift;
+    my $self = bless { }, $class;
+    $self->reset();
+    $self;
+}
+
+sub reset {
+    my $self = shift;
+    $self->{lineno} = 0;
+    $self->{byteno} = 0;
+    $self->{heredoc} = undef;
+    $self->{in_sub} = undef;
+    $self->{in_package} = undef;
+    $self->{in_arrow} = undef;
+    $self->{in_usereq} = undef;
+}
+
+sub tokenize {
+    my($self, $coderef) = @_;
+    my $tokenizer = PPI::Tokenizer->new($coderef) or die "Can't tokenize code: $$coderef";
+    while (my $token = $tokenizer->get_token) {
+        $self->dump_element($token);
+    }
+    my $code_length = length $$coderef;
+    $self->{byteno} == $code_length or die "Tokenize error: $self->{byteno}:$code_length";
+}
+
+sub dump_element {
+    my($self, $element) = @_;
+    if ($element->isa('PPI::Token::HereDoc')) {
+        $self->_dump("punct", $element->content);
+        $self->{heredoc} ||= [];
+        push @{$self->{heredoc}}, {
+            body => $element->{_heredoc},
+            eof => $element->{_terminator_line},
+        };
+        return;
+    } elsif ($self->{heredoc} && $element->isa('PPI::Token::Whitespace') && $element->content eq "\n") {
+        $self->_dump(token_name($element), $element->content);
+        for my $heredoc (@{$self->{heredoc}}) {
+            $self->_dump(string => join "", @{$heredoc->{body}});
+            $self->_dump(punct => $heredoc->{eof});
+        }
+        $self->{heredoc} = undef;
+        return;
+    } elsif ($element->isa('PPI::Token::Word') && $element->content eq 'sub') {
+        $self->{in_sub} = 1;
+    } elsif ($element->isa('PPI::Token::Word') && $element->content eq 'package') {
+        $self->{in_package} = 1;
+    } elsif ($element->isa('PPI::Token::Word') && ($element->content eq 'use' || $element->content eq 'require')) {
+        $self->{in_usereq} = 1;
+    } elsif ($element->isa('PPI::Token::Operator') && $element->content eq '->') {
+        $self->{in_arrow} = 1;
+    } elsif ($self->{in_sub} && !$element->isa('PPI::Token::Whitespace')) {
+        $self->{in_sub} = undef;
+        if ($element->isa('PPI::Token::Word')) {
+            warn "sub $element->{content}\n" if $Debug;
+            $self->_dump(fundef => $element->content);
+            return;
+        }
+    } elsif ($self->{in_package} && !$element->isa('PPI::Token::Whitespace')) {
+        $self->{in_package} = undef;
+        if ($element->isa('PPI::Token::Word')) {
+            warn "package $element->{content}\n" if $Debug;
+            $self->_dump(classdef => $element->content);
+            return;
+        }
+    } elsif ($self->{in_arrow} && !$element->isa('PPI::Token::Whitespace')) {
+        $self->{in_arrow} = undef;
+        if ($element->isa('PPI::Token::Word')) {
+            warn "->$element->{content}\n" if $Debug;
+            $self->_dump(funcall => $element->content);
+            return;
+        }
+    } elsif ($self->{in_usereq} && !$element->isa('PPI::Token::Whitespace')) {
+        $self->{in_usereq} = undef;
+        if ($element->isa('PPI::Token::Word')) {
+            warn "use $element->{content}\n" if $Debug;
+            $self->_dump(classref => $element->content);
+            return;
+        }
+    }
+    $self->_dump(token_name($element), $element->content);
+}
+
+sub _dump {
+    my($self, $type, $text) = @_;
+    my $bodysize = length $text;
+    print <<DUMP;
+$type
+$self->{lineno}
+$self->{byteno}
+$bodysize
+$text
+DUMP
+;
+    $self->{byteno} += $bodysize;
+    $self->{lineno} += $text =~ tr/\n//d;
+}
+
+sub token_name {
+    my $token = shift;
+    if ($token->isa('PPI::Token::Word')) {
+        return $ReservedWords{$token->content} ? "keyword" :
+               $BuiltinFunctions{$token->content} ? "funcall" : "word";
+    } elsif (ref($token) eq 'PPI::Token::Number') {
+        return $token->{_subtype} eq 'base256' ? "floating" : "integer";
+    }
+    $TokenMap{ref($token)} || "word";
+}
+
+BEGIN {
+    %TokenMap = qw(
+        PPI::Token::ArrayIndex ident
+        PPI::Token::Attribute fundef
+        PPI::Token::Cast punct
+        PPI::Token::Comment text
+        PPI::Token::DashedWord punct
+        PPI::Token::Data text
+        PPI::Token::End punct
+        PPI::Token::HereDoc *
+        PPI::Token::Label word
+        PPI::Token::Magic punct
+        PPI::Token::Number *
+        PPI::Token::Operator punct
+        PPI::Token::Pod text
+        PPI::Token::Prototype punct
+        PPI::Token::Quote::Double string
+        PPI::Token::Quote::Interpolate string
+        PPI::Token::Quote::Literal string
+        PPI::Token::Quote::Single string
+        PPI::Token::QuoteLike::Backtick string
+        PPI::Token::QuoteLike::Command string
+        PPI::Token::QuoteLike::Readline string
+        PPI::Token::QuoteLike::Regexp string
+        PPI::Token::QuoteLike::Words string
+        PPI::Token::Regexp::Match word
+        PPI::Token::Regexp::Substitute word
+        PPI::Token::Regexp::Transliterate word
+        PPI::Token::Separator punct
+        PPI::Token::Structure punct
+        PPI::Token::Symbol ident
+        PPI::Token::Unknown punct
+        PPI::Token::Whitespace punct
+        PPI::Token::Word *
+    );
+
+    # borrowed from Apache::PrettyPerl, with slight fixes
+    %ReservedWords = map { $_ => 1 } qw(
+        while until for foreach unless if elsif else do
+        package use no require import and or eq ne cmp
+        my our local next last redo goto return sub
+    );
+    %BuiltinFunctions = map { $_ => 1 } qw(
+        abs accept alarm atan2 bind binmode bless
+        caller chdir chmod chomp chop chown chr
+        chroot close closedir connect continue cos
+        crypt dbmclose dbmopen defined delete die
+        dump each endgrent endhostent endnetent
+        endprotoent endpwent endservent eof eval
+        exec exists exit exp fcntl fileno flock
+        fork format formline getc getgrent getgrgid
+        getgrnam gethostbyaddr gethostbyname gethostent
+        getlogin getnetbyaddr getnetbyname getnetent
+        getpeername getpgrp getppid getpriority
+        getprotobyname getprotobynumber getprotoent
+        getpwent getpwnam getpwuid getservbyname
+        getservbyport getservent getsockname
+        getsockopt glob gmtime goto grep hex index
+        int ioctl join keys kill last lc lcfirst
+        length link listen local localtime log
+        lstat map mkdir msgctl msgget msgrcv
+        msgsnd my next oct open opendir ord our pack
+        pipe pop pos print printf prototype push
+        quotemeta rand read readdir readline
+        readlink readpipe recv redo ref rename
+        reset return reverse rewinddir rindex
+        rmdir scalar seek seekdir select semctl
+        semget semop send setgrent sethostent
+        setnetent setpgrp setpriority setprotoent
+        setpwent setservent setsockopt shift shmctl
+        shmget shmread shmwrite shutdown sin sleep
+        socket socketpair sort splice split sprintf
+        sqrt srand stat study sub substr symlink
+        syscall sysopen sysread sysread sysseek
+        system syswrite tell telldir tie tied
+        time times truncate uc ucfirst umask undef
+        unlink unpack unshift untie utime values
+        vec wait waitpid wantarray warn write
+    );
+}
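
Taken together with perl.rb above, the pipe protocol tokenizer.pl speaks is: the caller writes the input length on one line followed by the raw source, and for every token the script prints four header lines (type, line number, byte offset, body length), then the token body, then a newline. A self-contained sketch of a client for that protocol, with the tokenizer path shown purely as an illustration:

    # Drives tokenizer.pl directly, mirroring what LangScan::Perl.scan does.
    # Adjust the path to wherever langscan/perl/tokenizer.pl is installed.
    tokenizer = "lib/langscan/perl/tokenizer.pl"
    src = "print \"hello\\n\";\n"

    IO.popen(["perl", tokenizer], "r+") do |io|
      io.puts(src.length)          # length line, then the source itself
      io.write(src)
      seen = 0
      while seen < src.length
        type    = io.readline.chomp
        lineno  = io.readline.chomp.to_i
        byteno  = io.readline.chomp.to_i
        bodylen = io.readline.chomp.to_i
        body    = io.read(bodylen)
        io.read(1)                 # trailing newline after the body
        puts "#{type} #{lineno}:#{byteno} #{body.inspect}"
        seen += bodylen
      end
    end
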