coderay 0.9.8 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. data/{lib/README → README_INDEX.rdoc} +10 -21
  2. data/Rakefile +6 -6
  3. data/bin/coderay +193 -64
  4. data/lib/coderay.rb +61 -105
  5. data/lib/coderay/duo.rb +17 -21
  6. data/lib/coderay/encoder.rb +100 -112
  7. data/lib/coderay/encoders/_map.rb +12 -7
  8. data/lib/coderay/encoders/comment_filter.rb +12 -30
  9. data/lib/coderay/encoders/count.rb +29 -11
  10. data/lib/coderay/encoders/debug.rb +32 -20
  11. data/lib/coderay/encoders/div.rb +13 -9
  12. data/lib/coderay/encoders/filter.rb +34 -51
  13. data/lib/coderay/encoders/html.rb +155 -161
  14. data/lib/coderay/encoders/html/css.rb +4 -9
  15. data/lib/coderay/encoders/html/numbering.rb +115 -0
  16. data/lib/coderay/encoders/html/output.rb +22 -70
  17. data/lib/coderay/encoders/json.rb +59 -45
  18. data/lib/coderay/encoders/lines_of_code.rb +12 -57
  19. data/lib/coderay/encoders/null.rb +6 -14
  20. data/lib/coderay/encoders/page.rb +13 -9
  21. data/lib/coderay/encoders/span.rb +13 -9
  22. data/lib/coderay/encoders/statistic.rb +58 -39
  23. data/lib/coderay/encoders/terminal.rb +179 -0
  24. data/lib/coderay/encoders/text.rb +31 -17
  25. data/lib/coderay/encoders/token_kind_filter.rb +111 -0
  26. data/lib/coderay/encoders/xml.rb +19 -18
  27. data/lib/coderay/encoders/yaml.rb +37 -9
  28. data/lib/coderay/for_redcloth.rb +4 -4
  29. data/lib/coderay/helpers/file_type.rb +127 -246
  30. data/lib/coderay/helpers/gzip.rb +41 -0
  31. data/lib/coderay/helpers/plugin.rb +241 -306
  32. data/lib/coderay/helpers/word_list.rb +65 -126
  33. data/lib/coderay/scanner.rb +173 -156
  34. data/lib/coderay/scanners/_map.rb +18 -17
  35. data/lib/coderay/scanners/c.rb +63 -77
  36. data/lib/coderay/scanners/clojure.rb +217 -0
  37. data/lib/coderay/scanners/cpp.rb +71 -84
  38. data/lib/coderay/scanners/css.rb +103 -120
  39. data/lib/coderay/scanners/debug.rb +47 -44
  40. data/lib/coderay/scanners/delphi.rb +70 -76
  41. data/lib/coderay/scanners/diff.rb +141 -50
  42. data/lib/coderay/scanners/erb.rb +81 -0
  43. data/lib/coderay/scanners/groovy.rb +104 -113
  44. data/lib/coderay/scanners/haml.rb +168 -0
  45. data/lib/coderay/scanners/html.rb +181 -110
  46. data/lib/coderay/scanners/java.rb +73 -75
  47. data/lib/coderay/scanners/java/builtin_types.rb +2 -0
  48. data/lib/coderay/scanners/java_script.rb +90 -101
  49. data/lib/coderay/scanners/json.rb +40 -53
  50. data/lib/coderay/scanners/php.rb +123 -147
  51. data/lib/coderay/scanners/python.rb +93 -91
  52. data/lib/coderay/scanners/raydebug.rb +66 -0
  53. data/lib/coderay/scanners/ruby.rb +343 -326
  54. data/lib/coderay/scanners/ruby/patterns.rb +40 -106
  55. data/lib/coderay/scanners/ruby/string_state.rb +71 -0
  56. data/lib/coderay/scanners/sql.rb +80 -66
  57. data/lib/coderay/scanners/text.rb +26 -0
  58. data/lib/coderay/scanners/xml.rb +1 -1
  59. data/lib/coderay/scanners/yaml.rb +74 -73
  60. data/lib/coderay/style.rb +10 -7
  61. data/lib/coderay/styles/_map.rb +3 -3
  62. data/lib/coderay/styles/alpha.rb +143 -0
  63. data/lib/coderay/token_kinds.rb +90 -0
  64. data/lib/coderay/tokens.rb +102 -277
  65. data/lib/coderay/tokens_proxy.rb +55 -0
  66. data/lib/coderay/version.rb +3 -0
  67. data/test/functional/basic.rb +200 -18
  68. data/test/functional/examples.rb +130 -0
  69. data/test/functional/for_redcloth.rb +15 -8
  70. data/test/functional/suite.rb +9 -6
  71. metadata +103 -123
  72. data/FOLDERS +0 -53
  73. data/bin/coderay_stylesheet +0 -4
  74. data/lib/coderay/encoders/html/numerization.rb +0 -133
  75. data/lib/coderay/encoders/term.rb +0 -158
  76. data/lib/coderay/encoders/token_class_filter.rb +0 -84
  77. data/lib/coderay/helpers/gzip_simple.rb +0 -123
  78. data/lib/coderay/scanners/nitro_xhtml.rb +0 -136
  79. data/lib/coderay/scanners/plaintext.rb +0 -20
  80. data/lib/coderay/scanners/rhtml.rb +0 -78
  81. data/lib/coderay/scanners/scheme.rb +0 -145
  82. data/lib/coderay/styles/cycnus.rb +0 -152
  83. data/lib/coderay/styles/murphy.rb +0 -134
  84. data/lib/coderay/token_classes.rb +0 -86
  85. data/test/functional/load_plugin_scanner.rb +0 -11
  86. data/test/functional/vhdl.rb +0 -126
  87. data/test/functional/word_list.rb +0 -79
@@ -3,14 +3,19 @@ module Scanners
3
3
 
4
4
  load :html
5
5
 
6
+ # Scanner for PHP.
7
+ #
6
8
  # Original by Stefan Walk.
7
9
  class PHP < Scanner
8
10
 
9
11
  register_for :php
10
12
  file_extension 'php'
13
+ encoding 'BINARY'
11
14
 
12
15
  KINDS_NOT_LOC = HTML::KINDS_NOT_LOC
13
16
 
17
+ protected
18
+
14
19
  def setup
15
20
  @html_scanner = CodeRay.scanner :html, :tokens => @tokens, :keep_tokens => true, :keep_state => true
16
21
  end
@@ -20,7 +25,7 @@ module Scanners
20
25
  @html_scanner.reset
21
26
  end
22
27
 
23
- module Words
28
+ module Words # :nodoc:
24
29
 
25
30
  # according to http://www.php.net/manual/en/reserved.keywords.php
26
31
  KEYWORDS = %w[
@@ -176,20 +181,20 @@ module Scanners
176
181
  $argc $argv
177
182
  ]
178
183
 
179
- IDENT_KIND = CaseIgnoringWordList.new(:ident).
180
- add(KEYWORDS, :reserved).
181
- add(TYPES, :pre_type).
182
- add(LANGUAGE_CONSTRUCTS, :reserved).
184
+ IDENT_KIND = WordList::CaseIgnoring.new(:ident).
185
+ add(KEYWORDS, :keyword).
186
+ add(TYPES, :predefined_type).
187
+ add(LANGUAGE_CONSTRUCTS, :keyword).
183
188
  add(BUILTIN_FUNCTIONS, :predefined).
184
- add(CLASSES, :pre_constant).
189
+ add(CLASSES, :predefined_constant).
185
190
  add(EXCEPTIONS, :exception).
186
- add(CONSTANTS, :pre_constant)
191
+ add(CONSTANTS, :predefined_constant)
187
192
 
188
193
  VARIABLE_KIND = WordList.new(:local_variable).
189
194
  add(PREDEFINED, :predefined)
190
195
  end
191
196
 
192
- module RE
197
+ module RE # :nodoc:
193
198
 
194
199
  PHP_START = /
195
200
  <script\s+[^>]*?language\s*=\s*"php"[^>]*?> |
@@ -224,17 +229,13 @@ module Scanners
224
229
 
225
230
  end
226
231
 
227
- def scan_tokens tokens, options
228
- if string.respond_to?(:encoding)
229
- unless string.encoding == Encoding::ASCII_8BIT
230
- self.string = string.encode Encoding::ASCII_8BIT,
231
- :invalid => :replace, :undef => :replace, :replace => '?'
232
- end
233
- end
232
+ protected
233
+
234
+ def scan_tokens encoder, options
234
235
 
235
236
  if check(RE::PHP_START) || # starts with <?
236
- (match?(/\s*<\S/) && exist?(RE::PHP_START)) || # starts with tag and contains <?
237
- exist?(RE::HTML_INDICATOR) ||
237
+ (match?(/\s*<\S/) && check(/.{1,1000}#{RE::PHP_START}/om)) || # starts with tag and contains <?
238
+ check(/.{0,1000}#{RE::HTML_INDICATOR}/om) ||
238
239
  check(/.{1,100}#{RE::PHP_START}/om) # PHP start after max 100 chars
239
240
  # is HTML with embedded PHP, so start with HTML
240
241
  states = [:initial]
@@ -252,29 +253,24 @@ module Scanners
252
253
 
253
254
  until eos?
254
255
 
255
- match = nil
256
- kind = nil
257
-
258
256
  case states.last
259
257
 
260
258
  when :initial # HTML
261
- if scan RE::PHP_START
262
- kind = :inline_delimiter
259
+ if match = scan(RE::PHP_START)
260
+ encoder.text_token match, :inline_delimiter
263
261
  label_expected = true
264
262
  states << :php
265
263
  else
266
264
  match = scan_until(/(?=#{RE::PHP_START})/o) || scan_rest
267
265
  @html_scanner.tokenize match unless match.empty?
268
- next
269
266
  end
270
267
 
271
268
  when :php
272
269
  if match = scan(/\s+/)
273
- tokens << [match, :space]
274
- next
270
+ encoder.text_token match, :space
275
271
 
276
- elsif scan(%r! (?m: \/\* (?: .*? \*\/ | .* ) ) | (?://|\#) .*? (?=#{RE::PHP_END}|$) !xo)
277
- kind = :comment
272
+ elsif match = scan(%r! (?m: \/\* (?: .*? \*\/ | .* ) ) | (?://|\#) .*? (?=#{RE::PHP_END}|$) !xo)
273
+ encoder.text_token match, :comment
278
274
 
279
275
  elsif match = scan(RE::IDENTIFIER)
280
276
  kind = Words::IDENT_KIND[match]
@@ -285,7 +281,7 @@ module Scanners
285
281
  label_expected = false
286
282
  if kind == :ident && match =~ /^[A-Z]/
287
283
  kind = :constant
288
- elsif kind == :reserved
284
+ elsif kind == :keyword
289
285
  case match
290
286
  when 'class'
291
287
  states << :class_expected
@@ -299,77 +295,68 @@ module Scanners
299
295
  next
300
296
  end
301
297
  end
298
+ encoder.text_token match, kind
302
299
 
303
- elsif scan(/(?:\d+\.\d*|\d*\.\d+)(?:e[-+]?\d+)?|\d+e[-+]?\d+/i)
300
+ elsif match = scan(/(?:\d+\.\d*|\d*\.\d+)(?:e[-+]?\d+)?|\d+e[-+]?\d+/i)
304
301
  label_expected = false
305
- kind = :float
302
+ encoder.text_token match, :float
306
303
 
307
- elsif scan(/0x[0-9a-fA-F]+/)
304
+ elsif match = scan(/0x[0-9a-fA-F]+/)
308
305
  label_expected = false
309
- kind = :hex
306
+ encoder.text_token match, :hex
310
307
 
311
- elsif scan(/\d+/)
308
+ elsif match = scan(/\d+/)
312
309
  label_expected = false
313
- kind = :integer
314
-
315
- elsif scan(/'/)
316
- tokens << [:open, :string]
317
- if modifier
318
- tokens << [modifier, :modifier]
319
- modifier = nil
320
- end
321
- kind = :delimiter
322
- states.push :sqstring
310
+ encoder.text_token match, :integer
323
311
 
324
- elsif match = scan(/["`]/)
325
- tokens << [:open, :string]
312
+ elsif match = scan(/['"`]/)
313
+ encoder.begin_group :string
326
314
  if modifier
327
- tokens << [modifier, :modifier]
315
+ encoder.text_token modifier, :modifier
328
316
  modifier = nil
329
317
  end
330
318
  delimiter = match
331
- kind = :delimiter
332
- states.push :dqstring
319
+ encoder.text_token match, :delimiter
320
+ states.push match == "'" ? :sqstring : :dqstring
333
321
 
334
322
  elsif match = scan(RE::VARIABLE)
335
323
  label_expected = false
336
- kind = Words::VARIABLE_KIND[match]
324
+ encoder.text_token match, Words::VARIABLE_KIND[match]
337
325
 
338
- elsif scan(/\{/)
339
- kind = :operator
326
+ elsif match = scan(/\{/)
327
+ encoder.text_token match, :operator
340
328
  label_expected = true
341
329
  states.push :php
342
330
 
343
- elsif scan(/\}/)
331
+ elsif match = scan(/\}/)
344
332
  if states.size == 1
345
- kind = :error
333
+ encoder.text_token match, :error
346
334
  else
347
335
  states.pop
348
336
  if states.last.is_a?(::Array)
349
337
  delimiter = states.last[1]
350
338
  states[-1] = states.last[0]
351
- tokens << [matched, :delimiter]
352
- tokens << [:close, :inline]
353
- next
339
+ encoder.text_token match, :delimiter
340
+ encoder.end_group :inline
354
341
  else
355
- kind = :operator
342
+ encoder.text_token match, :operator
356
343
  label_expected = true
357
344
  end
358
345
  end
359
346
 
360
- elsif scan(/@/)
347
+ elsif match = scan(/@/)
361
348
  label_expected = false
362
- kind = :exception
349
+ encoder.text_token match, :exception
363
350
 
364
- elsif scan RE::PHP_END
365
- kind = :inline_delimiter
351
+ elsif match = scan(RE::PHP_END)
352
+ encoder.text_token match, :inline_delimiter
366
353
  states = [:initial]
367
354
 
368
355
  elsif match = scan(/<<<(?:(#{RE::IDENTIFIER})|"(#{RE::IDENTIFIER})"|'(#{RE::IDENTIFIER})')/o)
369
- tokens << [:open, :string]
370
- warn 'heredoc in heredoc?' if heredoc_delimiter
356
+ encoder.begin_group :string
357
+ # warn 'heredoc in heredoc?' if heredoc_delimiter
371
358
  heredoc_delimiter = Regexp.escape(self[1] || self[2] || self[3])
372
- kind = :delimiter
359
+ encoder.text_token match, :delimiter
373
360
  states.push self[3] ? :sqstring : :dqstring
374
361
  heredoc_delimiter = /#{heredoc_delimiter}(?=;?$)/
375
362
 
@@ -379,152 +366,141 @@ module Scanners
379
366
  label_expected = true if match == ':'
380
367
  case_expected = false
381
368
  end
382
- kind = :operator
369
+ encoder.text_token match, :operator
383
370
 
384
371
  else
385
- getch
386
- kind = :error
372
+ encoder.text_token getch, :error
387
373
 
388
374
  end
389
375
 
390
376
  when :sqstring
391
- if scan(heredoc_delimiter ? /[^\\\n]+/ : /[^'\\]+/)
392
- kind = :content
393
- elsif !heredoc_delimiter && scan(/'/)
394
- tokens << [matched, :delimiter]
395
- tokens << [:close, :string]
377
+ if match = scan(heredoc_delimiter ? /[^\\\n]+/ : /[^'\\]+/)
378
+ encoder.text_token match, :content
379
+ elsif !heredoc_delimiter && match = scan(/'/)
380
+ encoder.text_token match, :delimiter
381
+ encoder.end_group :string
396
382
  delimiter = nil
397
383
  label_expected = false
398
384
  states.pop
399
- next
400
385
  elsif heredoc_delimiter && match = scan(/\n/)
401
- kind = :content
402
386
  if scan heredoc_delimiter
403
- tokens << ["\n", :content]
404
- tokens << [matched, :delimiter]
405
- tokens << [:close, :string]
387
+ encoder.text_token "\n", :content
388
+ encoder.text_token matched, :delimiter
389
+ encoder.end_group :string
406
390
  heredoc_delimiter = nil
407
391
  label_expected = false
408
392
  states.pop
409
- next
393
+ else
394
+ encoder.text_token match, :content
410
395
  end
411
- elsif scan(heredoc_delimiter ? /\\\\/ : /\\[\\'\n]/)
412
- kind = :char
413
- elsif scan(/\\./m)
414
- kind = :content
415
- elsif scan(/\\/)
416
- kind = :error
396
+ elsif match = scan(heredoc_delimiter ? /\\\\/ : /\\[\\'\n]/)
397
+ encoder.text_token match, :char
398
+ elsif match = scan(/\\./m)
399
+ encoder.text_token match, :content
400
+ elsif match = scan(/\\/)
401
+ encoder.text_token match, :error
402
+ else
403
+ states.pop
417
404
  end
418
405
 
419
406
  when :dqstring
420
- if scan(heredoc_delimiter ? /[^${\\\n]+/ : (delimiter == '"' ? /[^"${\\]+/ : /[^`${\\]+/))
421
- kind = :content
422
- elsif !heredoc_delimiter && scan(delimiter == '"' ? /"/ : /`/)
423
- tokens << [matched, :delimiter]
424
- tokens << [:close, :string]
407
+ if match = scan(heredoc_delimiter ? /[^${\\\n]+/ : (delimiter == '"' ? /[^"${\\]+/ : /[^`${\\]+/))
408
+ encoder.text_token match, :content
409
+ elsif !heredoc_delimiter && match = scan(delimiter == '"' ? /"/ : /`/)
410
+ encoder.text_token match, :delimiter
411
+ encoder.end_group :string
425
412
  delimiter = nil
426
413
  label_expected = false
427
414
  states.pop
428
- next
429
415
  elsif heredoc_delimiter && match = scan(/\n/)
430
- kind = :content
431
416
  if scan heredoc_delimiter
432
- tokens << ["\n", :content]
433
- tokens << [matched, :delimiter]
434
- tokens << [:close, :string]
417
+ encoder.text_token "\n", :content
418
+ encoder.text_token matched, :delimiter
419
+ encoder.end_group :string
435
420
  heredoc_delimiter = nil
436
421
  label_expected = false
437
422
  states.pop
438
- next
423
+ else
424
+ encoder.text_token match, :content
439
425
  end
440
- elsif scan(/\\(?:x[0-9A-Fa-f]{1,2}|[0-7]{1,3})/)
441
- kind = :char
442
- elsif scan(heredoc_delimiter ? /\\[nrtvf\\$]/ : (delimiter == '"' ? /\\[nrtvf\\$"]/ : /\\[nrtvf\\$`]/))
443
- kind = :char
444
- elsif scan(/\\./m)
445
- kind = :content
446
- elsif scan(/\\/)
447
- kind = :error
426
+ elsif match = scan(/\\(?:x[0-9A-Fa-f]{1,2}|[0-7]{1,3})/)
427
+ encoder.text_token match, :char
428
+ elsif match = scan(heredoc_delimiter ? /\\[nrtvf\\$]/ : (delimiter == '"' ? /\\[nrtvf\\$"]/ : /\\[nrtvf\\$`]/))
429
+ encoder.text_token match, :char
430
+ elsif match = scan(/\\./m)
431
+ encoder.text_token match, :content
432
+ elsif match = scan(/\\/)
433
+ encoder.text_token match, :error
448
434
  elsif match = scan(/#{RE::VARIABLE}/o)
449
- kind = :local_variable
450
435
  if check(/\[#{RE::IDENTIFIER}\]/o)
451
- tokens << [:open, :inline]
452
- tokens << [match, :local_variable]
453
- tokens << [scan(/\[/), :operator]
454
- tokens << [scan(/#{RE::IDENTIFIER}/o), :ident]
455
- tokens << [scan(/\]/), :operator]
456
- tokens << [:close, :inline]
457
- next
436
+ encoder.begin_group :inline
437
+ encoder.text_token match, :local_variable
438
+ encoder.text_token scan(/\[/), :operator
439
+ encoder.text_token scan(/#{RE::IDENTIFIER}/o), :ident
440
+ encoder.text_token scan(/\]/), :operator
441
+ encoder.end_group :inline
458
442
  elsif check(/\[/)
459
443
  match << scan(/\[['"]?#{RE::IDENTIFIER}?['"]?\]?/o)
460
- kind = :error
444
+ encoder.text_token match, :error
461
445
  elsif check(/->#{RE::IDENTIFIER}/o)
462
- tokens << [:open, :inline]
463
- tokens << [match, :local_variable]
464
- tokens << [scan(/->/), :operator]
465
- tokens << [scan(/#{RE::IDENTIFIER}/o), :ident]
466
- tokens << [:close, :inline]
467
- next
446
+ encoder.begin_group :inline
447
+ encoder.text_token match, :local_variable
448
+ encoder.text_token scan(/->/), :operator
449
+ encoder.text_token scan(/#{RE::IDENTIFIER}/o), :ident
450
+ encoder.end_group :inline
468
451
  elsif check(/->/)
469
452
  match << scan(/->/)
470
- kind = :error
453
+ encoder.text_token match, :error
454
+ else
455
+ encoder.text_token match, :local_variable
471
456
  end
472
457
  elsif match = scan(/\{/)
473
458
  if check(/\$/)
474
- kind = :delimiter
459
+ encoder.begin_group :inline
475
460
  states[-1] = [states.last, delimiter]
476
461
  delimiter = nil
477
462
  states.push :php
478
- tokens << [:open, :inline]
463
+ encoder.text_token match, :delimiter
479
464
  else
480
- kind = :string
465
+ encoder.text_token match, :content
481
466
  end
482
- elsif scan(/\$\{#{RE::IDENTIFIER}\}/o)
483
- kind = :local_variable
484
- elsif scan(/\$/)
485
- kind = :content
467
+ elsif match = scan(/\$\{#{RE::IDENTIFIER}\}/o)
468
+ encoder.text_token match, :local_variable
469
+ elsif match = scan(/\$/)
470
+ encoder.text_token match, :content
471
+ else
472
+ states.pop
486
473
  end
487
474
 
488
475
  when :class_expected
489
- if scan(/\s+/)
490
- kind = :space
476
+ if match = scan(/\s+/)
477
+ encoder.text_token match, :space
491
478
  elsif match = scan(/#{RE::IDENTIFIER}/o)
492
- kind = :class
479
+ encoder.text_token match, :class
493
480
  states.pop
494
481
  else
495
482
  states.pop
496
- next
497
483
  end
498
484
 
499
485
  when :function_expected
500
- if scan(/\s+/)
501
- kind = :space
502
- elsif scan(/&/)
503
- kind = :operator
486
+ if match = scan(/\s+/)
487
+ encoder.text_token match, :space
488
+ elsif match = scan(/&/)
489
+ encoder.text_token match, :operator
504
490
  elsif match = scan(/#{RE::IDENTIFIER}/o)
505
- kind = :function
491
+ encoder.text_token match, :function
506
492
  states.pop
507
493
  else
508
494
  states.pop
509
- next
510
495
  end
511
496
 
512
497
  else
513
- raise_inspect 'Unknown state!', tokens, states
498
+ raise_inspect 'Unknown state!', encoder, states
514
499
  end
515
500
 
516
- match ||= matched
517
- if $CODERAY_DEBUG and not kind
518
- raise_inspect 'Error token %p in line %d' %
519
- [[match, kind], line], tokens, states
520
- end
521
- raise_inspect 'Empty token', tokens, states unless match
522
-
523
- tokens << [match, kind]
524
-
525
501
  end
526
502
 
527
- tokens
503
+ encoder
528
504
  end
529
505
 
530
506
  end
@@ -1,12 +1,12 @@
1
1
  module CodeRay
2
2
  module Scanners
3
3
 
4
- # Bases on pygments' PythonLexer, see
4
+ # Scanner for Python. Supports Python 3.
5
+ #
6
+ # Based on pygments' PythonLexer, see
5
7
  # http://dev.pocoo.org/projects/pygments/browser/pygments/lexers/agile.py.
6
8
  class Python < Scanner
7
9
 
8
- include Streamable
9
-
10
10
  register_for :python
11
11
  file_extension 'py'
12
12
 
@@ -16,11 +16,11 @@ module Scanners
16
16
  'from', 'global', 'if', 'import', 'in', 'is', 'lambda', 'not',
17
17
  'or', 'pass', 'raise', 'return', 'try', 'while', 'with', 'yield',
18
18
  'nonlocal', # new in Python 3
19
- ]
19
+ ] # :nodoc:
20
20
 
21
21
  OLD_KEYWORDS = [
22
22
  'exec', 'print', # gone in Python 3
23
- ]
23
+ ] # :nodoc:
24
24
 
25
25
  PREDEFINED_METHODS_AND_TYPES = %w[
26
26
  __import__ abs all any apply basestring bin bool buffer
@@ -32,7 +32,7 @@ module Scanners
32
32
  raw_input reduce reload repr reversed round set setattr slice
33
33
  sorted staticmethod str sum super tuple type unichr unicode
34
34
  vars xrange zip
35
- ]
35
+ ] # :nodoc:
36
36
 
37
37
  PREDEFINED_EXCEPTIONS = %w[
38
38
  ArithmeticError AssertionError AttributeError
@@ -47,23 +47,23 @@ module Scanners
47
47
  TypeError UnboundLocalError UnicodeDecodeError
48
48
  UnicodeEncodeError UnicodeError UnicodeTranslateError
49
49
  UnicodeWarning UserWarning ValueError Warning ZeroDivisionError
50
- ]
50
+ ] # :nodoc:
51
51
 
52
52
  PREDEFINED_VARIABLES_AND_CONSTANTS = [
53
- 'False', 'True', 'None', # "keywords" since Python 3
53
+ 'False', 'True', 'None', # "keywords" since Python 3
54
54
  'self', 'Ellipsis', 'NotImplemented',
55
- ]
55
+ ] # :nodoc:
56
56
 
57
57
  IDENT_KIND = WordList.new(:ident).
58
58
  add(KEYWORDS, :keyword).
59
59
  add(OLD_KEYWORDS, :old_keyword).
60
60
  add(PREDEFINED_METHODS_AND_TYPES, :predefined).
61
- add(PREDEFINED_VARIABLES_AND_CONSTANTS, :pre_constant).
62
- add(PREDEFINED_EXCEPTIONS, :exception)
61
+ add(PREDEFINED_VARIABLES_AND_CONSTANTS, :predefined_constant).
62
+ add(PREDEFINED_EXCEPTIONS, :exception) # :nodoc:
63
63
 
64
- NAME = / [^\W\d] \w* /x
65
- ESCAPE = / [abfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x
66
- UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} | N\{[-\w ]+\} /x
64
+ NAME = / [^\W\d] \w* /x # :nodoc:
65
+ ESCAPE = / [abfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc:
66
+ UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} | N\{[-\w ]+\} /x # :nodoc:
67
67
 
68
68
  OPERATOR = /
69
69
  \.\.\. | # ellipsis
@@ -73,95 +73,103 @@ module Scanners
73
73
  [-+*\/%&|^]=? | # ordinary math and binary logic
74
74
  [~`] | # binary complement and inspection
75
75
  <<=? | >>=? | [<>=]=? | != # comparison and assignment
76
- /x
76
+ /x # :nodoc:
77
77
 
78
- STRING_DELIMITER_REGEXP = Hash.new do |h, delimiter|
79
- h[delimiter] = Regexp.union delimiter
80
- end
78
+ STRING_DELIMITER_REGEXP = Hash.new { |h, delimiter|
79
+ h[delimiter] = Regexp.union delimiter # :nodoc:
80
+ }
81
81
 
82
- STRING_CONTENT_REGEXP = Hash.new do |h, delimiter|
83
- h[delimiter] = / [^\\\n]+? (?= \\ | $ | #{Regexp.escape(delimiter)} ) /x
84
- end
82
+ STRING_CONTENT_REGEXP = Hash.new { |h, delimiter|
83
+ h[delimiter] = / [^\\\n]+? (?= \\ | $ | #{Regexp.escape(delimiter)} ) /x # :nodoc:
84
+ }
85
85
 
86
86
  DEF_NEW_STATE = WordList.new(:initial).
87
87
  add(%w(def), :def_expected).
88
88
  add(%w(import from), :include_expected).
89
- add(%w(class), :class_expected)
89
+ add(%w(class), :class_expected) # :nodoc:
90
90
 
91
91
  DESCRIPTOR = /
92
92
  #{NAME}
93
93
  (?: \. #{NAME} )*
94
94
  | \*
95
- /x
95
+ /x # :nodoc:
96
+
97
+ DOCSTRING_COMING = /
98
+ [ \t]* u?r? ("""|''')
99
+ /x # :nodoc:
96
100
 
97
- def scan_tokens tokens, options
101
+ protected
102
+
103
+ def scan_tokens encoder, options
98
104
 
99
105
  state = :initial
100
106
  string_delimiter = nil
101
107
  string_raw = false
108
+ string_type = nil
109
+ docstring_coming = match?(/#{DOCSTRING_COMING}/o)
102
110
  last_token_dot = false
103
111
  unicode = string.respond_to?(:encoding) && string.encoding.name == 'UTF-8'
104
112
  from_import_state = []
105
113
 
106
114
  until eos?
107
115
 
108
- kind = nil
109
- match = nil
110
-
111
116
  if state == :string
112
- if scan(STRING_DELIMITER_REGEXP[string_delimiter])
113
- tokens << [matched, :delimiter]
114
- tokens << [:close, :string]
117
+ if match = scan(STRING_DELIMITER_REGEXP[string_delimiter])
118
+ encoder.text_token match, :delimiter
119
+ encoder.end_group string_type
120
+ string_type = nil
115
121
  state = :initial
116
122
  next
117
- elsif string_delimiter.size == 3 && scan(/\n/)
118
- kind = :content
119
- elsif scan(STRING_CONTENT_REGEXP[string_delimiter])
120
- kind = :content
121
- elsif !string_raw && scan(/ \\ #{ESCAPE} /ox)
122
- kind = :char
123
- elsif scan(/ \\ #{UNICODE_ESCAPE} /ox)
124
- kind = :char
125
- elsif scan(/ \\ . /x)
126
- kind = :content
127
- elsif scan(/ \\ | $ /x)
128
- tokens << [:close, :string]
129
- kind = :error
123
+ elsif string_delimiter.size == 3 && match = scan(/\n/)
124
+ encoder.text_token match, :content
125
+ elsif match = scan(STRING_CONTENT_REGEXP[string_delimiter])
126
+ encoder.text_token match, :content
127
+ elsif !string_raw && match = scan(/ \\ #{ESCAPE} /ox)
128
+ encoder.text_token match, :char
129
+ elsif match = scan(/ \\ #{UNICODE_ESCAPE} /ox)
130
+ encoder.text_token match, :char
131
+ elsif match = scan(/ \\ . /x)
132
+ encoder.text_token match, :content
133
+ elsif match = scan(/ \\ | $ /x)
134
+ encoder.end_group string_type
135
+ string_type = nil
136
+ encoder.text_token match, :error
130
137
  state = :initial
131
138
  else
132
- raise_inspect "else case \" reached; %p not handled." % peek(1), tokens, state
139
+ raise_inspect "else case \" reached; %p not handled." % peek(1), encoder, state
133
140
  end
134
141
 
135
- elsif match = scan(/ [ \t]+ | \\\n /x)
136
- tokens << [match, :space]
137
- next
138
-
139
- elsif match = scan(/\n/)
140
- tokens << [match, :space]
141
- state = :initial if state == :include_expected
142
+ elsif match = scan(/ [ \t]+ | \\?\n /x)
143
+ encoder.text_token match, :space
144
+ if match == "\n"
145
+ state = :initial if state == :include_expected
146
+ docstring_coming = true if match?(/#{DOCSTRING_COMING}/o)
147
+ end
142
148
  next
143
149
 
144
150
  elsif match = scan(/ \# [^\n]* /mx)
145
- tokens << [match, :comment]
151
+ encoder.text_token match, :comment
146
152
  next
147
153
 
148
154
  elsif state == :initial
149
155
 
150
- if scan(/#{OPERATOR}/o)
151
- kind = :operator
156
+ if match = scan(/#{OPERATOR}/o)
157
+ encoder.text_token match, :operator
152
158
 
153
159
  elsif match = scan(/(u?r?|b)?("""|"|'''|')/i)
154
- tokens << [:open, :string]
155
160
  string_delimiter = self[2]
161
+ string_type = docstring_coming ? :docstring : :string
162
+ docstring_coming = false if docstring_coming
163
+ encoder.begin_group string_type
156
164
  string_raw = false
157
165
  modifiers = self[1]
158
166
  unless modifiers.empty?
159
167
  string_raw = !!modifiers.index(?r)
160
- tokens << [modifiers, :modifier]
168
+ encoder.text_token modifiers, :modifier
161
169
  match = string_delimiter
162
170
  end
163
171
  state = :string
164
- kind = :delimiter
172
+ encoder.text_token match, :delimiter
165
173
 
166
174
  # TODO: backticks
167
175
 
@@ -177,43 +185,45 @@ module Scanners
177
185
  state = DEF_NEW_STATE[match]
178
186
  from_import_state << match.to_sym if state == :include_expected
179
187
  end
188
+ encoder.text_token match, kind
180
189
 
181
- elsif scan(/@[a-zA-Z0-9_.]+[lL]?/)
182
- kind = :decorator
190
+ elsif match = scan(/@[a-zA-Z0-9_.]+[lL]?/)
191
+ encoder.text_token match, :decorator
183
192
 
184
- elsif scan(/0[xX][0-9A-Fa-f]+[lL]?/)
185
- kind = :hex
193
+ elsif match = scan(/0[xX][0-9A-Fa-f]+[lL]?/)
194
+ encoder.text_token match, :hex
186
195
 
187
- elsif scan(/0[bB][01]+[lL]?/)
188
- kind = :bin
196
+ elsif match = scan(/0[bB][01]+[lL]?/)
197
+ encoder.text_token match, :binary
189
198
 
190
199
  elsif match = scan(/(?:\d*\.\d+|\d+\.\d*)(?:[eE][+-]?\d+)?|\d+[eE][+-]?\d+/)
191
- kind = :float
192
200
  if scan(/[jJ]/)
193
201
  match << matched
194
- kind = :imaginary
202
+ encoder.text_token match, :imaginary
203
+ else
204
+ encoder.text_token match, :float
195
205
  end
196
206
 
197
- elsif scan(/0[oO][0-7]+|0[0-7]+(?![89.eE])[lL]?/)
198
- kind = :oct
207
+ elsif match = scan(/0[oO][0-7]+|0[0-7]+(?![89.eE])[lL]?/)
208
+ encoder.text_token match, :octal
199
209
 
200
210
  elsif match = scan(/\d+([lL])?/)
201
- kind = :integer
202
211
  if self[1] == nil && scan(/[jJ]/)
203
212
  match << matched
204
- kind = :imaginary
213
+ encoder.text_token match, :imaginary
214
+ else
215
+ encoder.text_token match, :integer
205
216
  end
206
217
 
207
218
  else
208
- getch
209
- kind = :error
219
+ encoder.text_token getch, :error
210
220
 
211
221
  end
212
222
 
213
223
  elsif state == :def_expected
214
224
  state = :initial
215
225
  if match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
216
- kind = :method
226
+ encoder.text_token match, :method
217
227
  else
218
228
  next
219
229
  end
@@ -221,33 +231,34 @@ module Scanners
221
231
  elsif state == :class_expected
222
232
  state = :initial
223
233
  if match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
224
- kind = :class
234
+ encoder.text_token match, :class
225
235
  else
226
236
  next
227
237
  end
228
238
 
229
239
  elsif state == :include_expected
230
240
  if match = scan(unicode ? /#{DESCRIPTOR}/uo : /#{DESCRIPTOR}/o)
231
- kind = :include
232
241
  if match == 'as'
233
- kind = :keyword
242
+ encoder.text_token match, :keyword
234
243
  from_import_state << :as
235
244
  elsif from_import_state.first == :from && match == 'import'
236
- kind = :keyword
245
+ encoder.text_token match, :keyword
237
246
  from_import_state << :import
238
247
  elsif from_import_state.last == :as
239
- # kind = match[0,1][unicode ? /[[:upper:]]/u : /[[:upper:]]/] ? :class : :method
240
- kind = :ident
248
+ # encoder.text_token match, match[0,1][unicode ? /[[:upper:]]/u : /[[:upper:]]/] ? :class : :method
249
+ encoder.text_token match, :ident
241
250
  from_import_state.pop
242
251
  elsif IDENT_KIND[match] == :keyword
243
252
  unscan
244
253
  match = nil
245
254
  state = :initial
246
255
  next
256
+ else
257
+ encoder.text_token match, :include
247
258
  end
248
259
  elsif match = scan(/,/)
249
260
  from_import_state.pop if from_import_state.last == :as
250
- kind = :operator
261
+ encoder.text_token match, :operator
251
262
  else
252
263
  from_import_state = []
253
264
  state = :initial
@@ -255,28 +266,19 @@ module Scanners
255
266
  end
256
267
 
257
268
  else
258
- raise_inspect 'Unknown state', tokens, state
269
+ raise_inspect 'Unknown state', encoder, state
259
270
 
260
271
  end
261
272
 
262
- match ||= matched
263
- if $CODERAY_DEBUG and not kind
264
- raise_inspect 'Error token %p in line %d' %
265
- [[match, kind], line], tokens, state
266
- end
267
- raise_inspect 'Empty token', tokens, state unless match
268
-
269
273
  last_token_dot = match == '.'
270
274
 
271
- tokens << [match, kind]
272
-
273
275
  end
274
276
 
275
277
  if state == :string
276
- tokens << [:close, :string]
278
+ encoder.end_group string_type
277
279
  end
278
280
 
279
- tokens
281
+ encoder
280
282
  end
281
283
 
282
284
  end