pygmentize 0.0.1

Files changed (101)
  1. data/LICENSE +19 -0
  2. data/lib/pygments.rb +23 -0
  3. data/pygmentize.gemspec +11 -0
  4. data/test/pygments.rb +19 -0
  5. data/vendor/pygmentize.py +7 -0
  6. data/vendor/pygments/AUTHORS +73 -0
  7. data/vendor/pygments/LICENSE +25 -0
  8. data/vendor/pygments/__init__.py +91 -0
  9. data/vendor/pygments/__init__.pyc +0 -0
  10. data/vendor/pygments/cmdline.py +430 -0
  11. data/vendor/pygments/cmdline.pyc +0 -0
  12. data/vendor/pygments/console.py +74 -0
  13. data/vendor/pygments/console.pyc +0 -0
  14. data/vendor/pygments/filter.py +74 -0
  15. data/vendor/pygments/filter.pyc +0 -0
  16. data/vendor/pygments/filters/__init__.py +357 -0
  17. data/vendor/pygments/filters/__init__.pyc +0 -0
  18. data/vendor/pygments/formatter.py +92 -0
  19. data/vendor/pygments/formatter.pyc +0 -0
  20. data/vendor/pygments/formatters/__init__.py +68 -0
  21. data/vendor/pygments/formatters/__init__.pyc +0 -0
  22. data/vendor/pygments/formatters/_mapping.py +92 -0
  23. data/vendor/pygments/formatters/_mapping.pyc +0 -0
  24. data/vendor/pygments/formatters/bbcode.py +109 -0
  25. data/vendor/pygments/formatters/bbcode.pyc +0 -0
  26. data/vendor/pygments/formatters/html.py +723 -0
  27. data/vendor/pygments/formatters/html.pyc +0 -0
  28. data/vendor/pygments/formatters/img.py +553 -0
  29. data/vendor/pygments/formatters/img.pyc +0 -0
  30. data/vendor/pygments/formatters/latex.py +354 -0
  31. data/vendor/pygments/formatters/latex.pyc +0 -0
  32. data/vendor/pygments/formatters/other.py +117 -0
  33. data/vendor/pygments/formatters/other.pyc +0 -0
  34. data/vendor/pygments/formatters/rtf.py +136 -0
  35. data/vendor/pygments/formatters/rtf.pyc +0 -0
  36. data/vendor/pygments/formatters/svg.py +154 -0
  37. data/vendor/pygments/formatters/svg.pyc +0 -0
  38. data/vendor/pygments/formatters/terminal.py +109 -0
  39. data/vendor/pygments/formatters/terminal.pyc +0 -0
  40. data/vendor/pygments/formatters/terminal256.py +219 -0
  41. data/vendor/pygments/formatters/terminal256.pyc +0 -0
  42. data/vendor/pygments/lexer.py +660 -0
  43. data/vendor/pygments/lexer.pyc +0 -0
  44. data/vendor/pygments/lexers/__init__.py +226 -0
  45. data/vendor/pygments/lexers/__init__.pyc +0 -0
  46. data/vendor/pygments/lexers/_asybuiltins.py +1645 -0
  47. data/vendor/pygments/lexers/_clbuiltins.py +232 -0
  48. data/vendor/pygments/lexers/_luabuiltins.py +256 -0
  49. data/vendor/pygments/lexers/_mapping.py +234 -0
  50. data/vendor/pygments/lexers/_mapping.pyc +0 -0
  51. data/vendor/pygments/lexers/_phpbuiltins.py +3389 -0
  52. data/vendor/pygments/lexers/_vimbuiltins.py +3 -0
  53. data/vendor/pygments/lexers/agile.py +1485 -0
  54. data/vendor/pygments/lexers/agile.pyc +0 -0
  55. data/vendor/pygments/lexers/asm.py +353 -0
  56. data/vendor/pygments/lexers/compiled.py +2365 -0
  57. data/vendor/pygments/lexers/dotnet.py +355 -0
  58. data/vendor/pygments/lexers/functional.py +756 -0
  59. data/vendor/pygments/lexers/functional.pyc +0 -0
  60. data/vendor/pygments/lexers/math.py +461 -0
  61. data/vendor/pygments/lexers/other.py +2297 -0
  62. data/vendor/pygments/lexers/parsers.py +695 -0
  63. data/vendor/pygments/lexers/special.py +100 -0
  64. data/vendor/pygments/lexers/special.pyc +0 -0
  65. data/vendor/pygments/lexers/templates.py +1387 -0
  66. data/vendor/pygments/lexers/text.py +1586 -0
  67. data/vendor/pygments/lexers/web.py +1619 -0
  68. data/vendor/pygments/lexers/web.pyc +0 -0
  69. data/vendor/pygments/plugin.py +74 -0
  70. data/vendor/pygments/plugin.pyc +0 -0
  71. data/vendor/pygments/scanner.py +104 -0
  72. data/vendor/pygments/style.py +117 -0
  73. data/vendor/pygments/style.pyc +0 -0
  74. data/vendor/pygments/styles/__init__.py +68 -0
  75. data/vendor/pygments/styles/__init__.pyc +0 -0
  76. data/vendor/pygments/styles/autumn.py +65 -0
  77. data/vendor/pygments/styles/borland.py +51 -0
  78. data/vendor/pygments/styles/bw.py +49 -0
  79. data/vendor/pygments/styles/colorful.py +81 -0
  80. data/vendor/pygments/styles/default.py +73 -0
  81. data/vendor/pygments/styles/default.pyc +0 -0
  82. data/vendor/pygments/styles/emacs.py +72 -0
  83. data/vendor/pygments/styles/friendly.py +72 -0
  84. data/vendor/pygments/styles/fruity.py +43 -0
  85. data/vendor/pygments/styles/manni.py +75 -0
  86. data/vendor/pygments/styles/monokai.py +106 -0
  87. data/vendor/pygments/styles/murphy.py +80 -0
  88. data/vendor/pygments/styles/native.py +65 -0
  89. data/vendor/pygments/styles/pastie.py +75 -0
  90. data/vendor/pygments/styles/perldoc.py +69 -0
  91. data/vendor/pygments/styles/tango.py +141 -0
  92. data/vendor/pygments/styles/trac.py +63 -0
  93. data/vendor/pygments/styles/vim.py +63 -0
  94. data/vendor/pygments/styles/vs.py +38 -0
  95. data/vendor/pygments/token.py +198 -0
  96. data/vendor/pygments/token.pyc +0 -0
  97. data/vendor/pygments/unistring.py +130 -0
  98. data/vendor/pygments/unistring.pyc +0 -0
  99. data/vendor/pygments/util.py +226 -0
  100. data/vendor/pygments/util.pyc +0 -0
  101. metadata +166 -0
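
The bulk of the gem is a vendored copy of the upstream Pygments 1.x distribution; lib/pygments.rb and vendor/pygmentize.py are thin wrappers around it. For orientation, the sketch below shows roughly how the vendored library is driven on the Python side, using only the standard upstream Pygments API; it is illustrative and not code taken from this gem's wrappers.

    # Illustrative sketch of the upstream Pygments API that this gem vendors;
    # not taken from lib/pygments.rb or vendor/pygmentize.py.
    from pygments import highlight
    from pygments.lexers import get_lexer_by_name
    from pygments.formatters import HtmlFormatter

    code = 'puts "hello"'
    lexer = get_lexer_by_name('ruby')               # resolve an alias to a Lexer subclass
    html = highlight(code, lexer, HtmlFormatter())  # returns an HTML-highlighted block

The largest single file in the vendored tree, data/vendor/pygments/lexer.py (+660 lines), follows in full.
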
data/vendor/pygments/lexer.py
@@ -0,0 +1,660 @@
+ # -*- coding: utf-8 -*-
+ """
+     pygments.lexer
+     ~~~~~~~~~~~~~~
+
+     Base lexer classes.
+
+     :copyright: Copyright 2006-2010 by the Pygments team, see AUTHORS.
+     :license: BSD, see LICENSE for details.
+ """
+ import re
+
+ from pygments.filter import apply_filters, Filter
+ from pygments.filters import get_filter_by_name
+ from pygments.token import Error, Text, Other, _TokenType
+ from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
+      make_analysator
+
+
+ __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
+            'LexerContext', 'include', 'bygroups', 'using', 'this']
+
+
+ _default_analyse = staticmethod(lambda x: 0.0)
+
+
+ class LexerMeta(type):
+     """
+     This metaclass automagically converts ``analyse_text`` methods into
+     static methods which always return float values.
+     """
+
+     def __new__(cls, name, bases, d):
+         if 'analyse_text' in d:
+             d['analyse_text'] = make_analysator(d['analyse_text'])
+         return type.__new__(cls, name, bases, d)
+
+
+ class Lexer(object):
+     """
+     Lexer for a specific language.
+
+     Basic options recognized:
+     ``stripnl``
+         Strip leading and trailing newlines from the input (default: True).
+     ``stripall``
+         Strip all leading and trailing whitespace from the input
+         (default: False).
+     ``ensurenl``
+         Make sure that the input ends with a newline (default: True). This
+         is required for some lexers that consume input linewise.
+         *New in Pygments 1.3.*
+     ``tabsize``
+         If given and greater than 0, expand tabs in the input (default: 0).
+     ``encoding``
+         If given, must be an encoding name. This encoding will be used to
+         convert the input string to Unicode, if it is not already a Unicode
+         string (default: ``'latin1'``).
+         Can also be ``'guess'`` to use a simple UTF-8 / Latin1 detection, or
+         ``'chardet'`` to use the chardet library, if it is installed.
+     """
+
+     #: Name of the lexer
+     name = None
+
+     #: Shortcuts for the lexer
+     aliases = []
+
+     #: fn match rules
+     filenames = []
+
+     #: fn alias filenames
+     alias_filenames = []
+
+     #: mime types
+     mimetypes = []
+
+     __metaclass__ = LexerMeta
+
+     def __init__(self, **options):
+         self.options = options
+         self.stripnl = get_bool_opt(options, 'stripnl', True)
+         self.stripall = get_bool_opt(options, 'stripall', False)
+         self.ensurenl = get_bool_opt(options, 'ensurenl', True)
+         self.tabsize = get_int_opt(options, 'tabsize', 0)
+         self.encoding = options.get('encoding', 'latin1')
+         # self.encoding = options.get('inencoding', None) or self.encoding
+         self.filters = []
+         for filter_ in get_list_opt(options, 'filters', ()):
+             self.add_filter(filter_)
+
+     def __repr__(self):
+         if self.options:
+             return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
+                                                      self.options)
+         else:
+             return '<pygments.lexers.%s>' % self.__class__.__name__
+
+     def add_filter(self, filter_, **options):
+         """
+         Add a new stream filter to this lexer.
+         """
+         if not isinstance(filter_, Filter):
+             filter_ = get_filter_by_name(filter_, **options)
+         self.filters.append(filter_)
+
+     def analyse_text(text):
+         """
+         Has to return a float between ``0`` and ``1`` that indicates
+         if a lexer wants to highlight this text. Used by ``guess_lexer``.
+         If this method returns ``0`` it won't highlight it in any case, if
+         it returns ``1`` highlighting with this lexer is guaranteed.
+
+         The `LexerMeta` metaclass automatically wraps this function so
+         that it works like a static method (no ``self`` or ``cls``
+         parameter) and the return value is automatically converted to
+         `float`. If the return value is an object that is boolean `False`
+         it's the same as if the return value was ``0.0``.
+         """
+
+     def get_tokens(self, text, unfiltered=False):
+         """
+         Return an iterable of (tokentype, value) pairs generated from
+         `text`. If `unfiltered` is set to `True`, the filtering mechanism
+         is bypassed even if filters are defined.
+
+         Also preprocess the text, i.e. expand tabs and strip it if
+         wanted, and apply registered filters.
+         """
+         if not isinstance(text, unicode):
+             if self.encoding == 'guess':
+                 try:
+                     text = text.decode('utf-8')
+                     if text.startswith(u'\ufeff'):
+                         text = text[len(u'\ufeff'):]
+                 except UnicodeDecodeError:
+                     text = text.decode('latin1')
+             elif self.encoding == 'chardet':
+                 try:
+                     import chardet
+                 except ImportError:
+                     raise ImportError('To enable chardet encoding guessing, '
+                                       'please install the chardet library '
+                                       'from http://chardet.feedparser.org/')
+                 enc = chardet.detect(text)
+                 text = text.decode(enc['encoding'])
+             else:
+                 text = text.decode(self.encoding)
+         # text now *is* a unicode string
+         text = text.replace('\r\n', '\n')
+         text = text.replace('\r', '\n')
+         if self.stripall:
+             text = text.strip()
+         elif self.stripnl:
+             text = text.strip('\n')
+         if self.tabsize > 0:
+             text = text.expandtabs(self.tabsize)
+         if self.ensurenl and not text.endswith('\n'):
+             text += '\n'
+
+         def streamer():
+             for i, t, v in self.get_tokens_unprocessed(text):
+                 yield t, v
+         stream = streamer()
+         if not unfiltered:
+             stream = apply_filters(stream, self.filters, self)
+         return stream
+
+     def get_tokens_unprocessed(self, text):
+         """
+         Return an iterable of (tokentype, value) pairs.
+         In subclasses, implement this method as a generator to
+         maximize effectiveness.
+         """
+         raise NotImplementedError
+
+
+ class DelegatingLexer(Lexer):
+     """
+     This lexer takes two lexers as arguments. A root lexer and
+     a language lexer. First everything is scanned using the language
+     lexer, afterwards all ``Other`` tokens are lexed using the root
+     lexer.
+
+     The lexers from the ``template`` lexer package use this base lexer.
+     """
+
+     def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
+         self.root_lexer = _root_lexer(**options)
+         self.language_lexer = _language_lexer(**options)
+         self.needle = _needle
+         Lexer.__init__(self, **options)
+
+     def get_tokens_unprocessed(self, text):
+         buffered = ''
+         insertions = []
+         lng_buffer = []
+         for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
+             if t is self.needle:
+                 if lng_buffer:
+                     insertions.append((len(buffered), lng_buffer))
+                     lng_buffer = []
+                 buffered += v
+             else:
+                 lng_buffer.append((i, t, v))
+         if lng_buffer:
+             insertions.append((len(buffered), lng_buffer))
+         return do_insertions(insertions,
+                              self.root_lexer.get_tokens_unprocessed(buffered))
+
+
+ #-------------------------------------------------------------------------------
+ # RegexLexer and ExtendedRegexLexer
+ #
+
+
+ class include(str):
+     """
+     Indicates that a state should include rules from another state.
+     """
+     pass
+
+
+ class combined(tuple):
+     """
+     Indicates a state combined from multiple states.
+     """
+
+     def __new__(cls, *args):
+         return tuple.__new__(cls, args)
+
+     def __init__(self, *args):
+         # tuple.__init__ doesn't do anything
+         pass
+
+
+ class _PseudoMatch(object):
+     """
+     A pseudo match object constructed from a string.
+     """
+
+     def __init__(self, start, text):
+         self._text = text
+         self._start = start
+
+     def start(self, arg=None):
+         return self._start
+
+     def end(self, arg=None):
+         return self._start + len(self._text)
+
+     def group(self, arg=None):
+         if arg:
+             raise IndexError('No such group')
+         return self._text
+
+     def groups(self):
+         return (self._text,)
+
+     def groupdict(self):
+         return {}
+
+
+ def bygroups(*args):
+     """
+     Callback that yields multiple actions for each group in the match.
+     """
+     def callback(lexer, match, ctx=None):
+         for i, action in enumerate(args):
+             if action is None:
+                 continue
+             elif type(action) is _TokenType:
+                 data = match.group(i + 1)
+                 if data:
+                     yield match.start(i + 1), action, data
+             else:
+                 if ctx:
+                     ctx.pos = match.start(i + 1)
+                 for item in action(lexer, _PseudoMatch(match.start(i + 1),
+                                    match.group(i + 1)), ctx):
+                     if item:
+                         yield item
+         if ctx:
+             ctx.pos = match.end()
+     return callback
+
+
+ class _This(object):
+     """
+     Special singleton used for indicating the caller class.
+     Used by ``using``.
+     """
+ this = _This()
+
+
+ def using(_other, **kwargs):
+     """
+     Callback that processes the match with a different lexer.
+
+     The keyword arguments are forwarded to the lexer, except `state` which
+     is handled separately.
+
+     `state` specifies the state that the new lexer will start in, and can
+     be an enumerable such as ('root', 'inline', 'string') or a simple
+     string which is assumed to be on top of the root state.
+
+     Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
+     """
+     gt_kwargs = {}
+     if 'state' in kwargs:
+         s = kwargs.pop('state')
+         if isinstance(s, (list, tuple)):
+             gt_kwargs['stack'] = s
+         else:
+             gt_kwargs['stack'] = ('root', s)
+
+     if _other is this:
+         def callback(lexer, match, ctx=None):
+             # if keyword arguments are given the callback
+             # function has to create a new lexer instance
+             if kwargs:
+                 # XXX: cache that somehow
+                 kwargs.update(lexer.options)
+                 lx = lexer.__class__(**kwargs)
+             else:
+                 lx = lexer
+             s = match.start()
+             for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
+                 yield i + s, t, v
+             if ctx:
+                 ctx.pos = match.end()
+     else:
+         def callback(lexer, match, ctx=None):
+             # XXX: cache that somehow
+             kwargs.update(lexer.options)
+             lx = _other(**kwargs)
+
+             s = match.start()
+             for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
+                 yield i + s, t, v
+             if ctx:
+                 ctx.pos = match.end()
+     return callback
+
+
+ class RegexLexerMeta(LexerMeta):
+     """
+     Metaclass for RegexLexer, creates the self._tokens attribute from
+     self.tokens on the first instantiation.
+     """
+
+     def _process_state(cls, unprocessed, processed, state):
+         assert type(state) is str, "wrong state name %r" % state
+         assert state[0] != '#', "invalid state name %r" % state
+         if state in processed:
+             return processed[state]
+         tokens = processed[state] = []
+         rflags = cls.flags
+         for tdef in unprocessed[state]:
+             if isinstance(tdef, include):
+                 # it's a state reference
+                 assert tdef != state, "circular state reference %r" % state
+                 tokens.extend(cls._process_state(unprocessed, processed, str(tdef)))
+                 continue
+
+             assert type(tdef) is tuple, "wrong rule def %r" % tdef
+
+             try:
+                 rex = re.compile(tdef[0], rflags).match
+             except Exception, err:
+                 raise ValueError("uncompilable regex %r in state %r of %r: %s" %
+                                  (tdef[0], state, cls, err))
+
+             assert type(tdef[1]) is _TokenType or callable(tdef[1]), \
+                    'token type must be simple type or callable, not %r' % (tdef[1],)
+
+             if len(tdef) == 2:
+                 new_state = None
+             else:
+                 tdef2 = tdef[2]
+                 if isinstance(tdef2, str):
+                     # an existing state
+                     if tdef2 == '#pop':
+                         new_state = -1
+                     elif tdef2 in unprocessed:
+                         new_state = (tdef2,)
+                     elif tdef2 == '#push':
+                         new_state = tdef2
+                     elif tdef2[:5] == '#pop:':
+                         new_state = -int(tdef2[5:])
+                     else:
+                         assert False, 'unknown new state %r' % tdef2
+                 elif isinstance(tdef2, combined):
+                     # combine a new state from existing ones
+                     new_state = '_tmp_%d' % cls._tmpname
+                     cls._tmpname += 1
+                     itokens = []
+                     for istate in tdef2:
+                         assert istate != state, 'circular state ref %r' % istate
+                         itokens.extend(cls._process_state(unprocessed,
+                                                           processed, istate))
+                     processed[new_state] = itokens
+                     new_state = (new_state,)
+                 elif isinstance(tdef2, tuple):
+                     # push more than one state
+                     for state in tdef2:
+                         assert (state in unprocessed or
+                                 state in ('#pop', '#push')), \
+                                'unknown new state ' + state
+                     new_state = tdef2
+                 else:
+                     assert False, 'unknown new state def %r' % tdef2
+             tokens.append((rex, tdef[1], new_state))
+         return tokens
+
+     def process_tokendef(cls, name, tokendefs=None):
+         processed = cls._all_tokens[name] = {}
+         tokendefs = tokendefs or cls.tokens[name]
+         for state in tokendefs.keys():
+             cls._process_state(tokendefs, processed, state)
+         return processed
+
+     def __call__(cls, *args, **kwds):
+         if not hasattr(cls, '_tokens'):
+             cls._all_tokens = {}
+             cls._tmpname = 0
+             if hasattr(cls, 'token_variants') and cls.token_variants:
+                 # don't process yet
+                 pass
+             else:
+                 cls._tokens = cls.process_tokendef('', cls.tokens)
+
+         return type.__call__(cls, *args, **kwds)
+
+
+ class RegexLexer(Lexer):
+     """
+     Base for simple stateful regular expression-based lexers.
+     Simplifies the lexing process so that you need only
+     provide a list of states and regular expressions.
+     """
+     __metaclass__ = RegexLexerMeta
+
+     #: Flags for compiling the regular expressions.
+     #: Defaults to MULTILINE.
+     flags = re.MULTILINE
+
+     #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
+     #:
+     #: The initial state is 'root'.
+     #: ``new_state`` can be omitted to signify no state transition.
+     #: If it is a string, the state is pushed on the stack and changed.
+     #: If it is a tuple of strings, all states are pushed on the stack and
+     #: the current state will be the topmost.
+     #: It can also be ``combined('state1', 'state2', ...)``
+     #: to signify a new, anonymous state combined from the rules of two
+     #: or more existing ones.
+     #: Furthermore, it can be '#pop' to signify going back one step in
+     #: the state stack, or '#push' to push the current state on the stack
+     #: again.
+     #:
+     #: The tuple can also be replaced with ``include('state')``, in which
+     #: case the rules from the state named by the string are included in the
+     #: current one.
+     tokens = {}
+
+     def get_tokens_unprocessed(self, text, stack=('root',)):
+         """
+         Split ``text`` into (tokentype, text) pairs.
+
+         ``stack`` is the initial stack (default: ``['root']``)
+         """
+         pos = 0
+         tokendefs = self._tokens
+         statestack = list(stack)
+         statetokens = tokendefs[statestack[-1]]
+         while 1:
+             for rexmatch, action, new_state in statetokens:
+                 m = rexmatch(text, pos)
+                 if m:
+                     if type(action) is _TokenType:
+                         yield pos, action, m.group()
+                     else:
+                         for item in action(self, m):
+                             yield item
+                     pos = m.end()
+                     if new_state is not None:
+                         # state transition
+                         if isinstance(new_state, tuple):
+                             for state in new_state:
+                                 if state == '#pop':
+                                     statestack.pop()
+                                 elif state == '#push':
+                                     statestack.append(statestack[-1])
+                                 else:
+                                     statestack.append(state)
+                         elif isinstance(new_state, int):
+                             # pop
+                             del statestack[new_state:]
+                         elif new_state == '#push':
+                             statestack.append(statestack[-1])
+                         else:
+                             assert False, "wrong state def: %r" % new_state
+                         statetokens = tokendefs[statestack[-1]]
+                     break
+             else:
+                 try:
+                     if text[pos] == '\n':
+                         # at EOL, reset state to "root"
+                         pos += 1
+                         statestack = ['root']
+                         statetokens = tokendefs['root']
+                         yield pos, Text, u'\n'
+                         continue
+                     yield pos, Error, text[pos]
+                     pos += 1
+                 except IndexError:
+                     break
+
+
+ class LexerContext(object):
+     """
+     A helper object that holds lexer position data.
+     """
+
+     def __init__(self, text, pos, stack=None, end=None):
+         self.text = text
+         self.pos = pos
+         self.end = end or len(text)  # end=0 not supported ;-)
+         self.stack = stack or ['root']
+
+     def __repr__(self):
+         return 'LexerContext(%r, %r, %r)' % (
+             self.text, self.pos, self.stack)
+
+
+ class ExtendedRegexLexer(RegexLexer):
+     """
+     A RegexLexer that uses a context object to store its state.
+     """
+
+     def get_tokens_unprocessed(self, text=None, context=None):
+         """
+         Split ``text`` into (tokentype, text) pairs.
+         If ``context`` is given, use this lexer context instead.
+         """
+         tokendefs = self._tokens
+         if not context:
+             ctx = LexerContext(text, 0)
+             statetokens = tokendefs['root']
+         else:
+             ctx = context
+             statetokens = tokendefs[ctx.stack[-1]]
+             text = ctx.text
+         while 1:
+             for rexmatch, action, new_state in statetokens:
+                 m = rexmatch(text, ctx.pos, ctx.end)
+                 if m:
+                     if type(action) is _TokenType:
+                         yield ctx.pos, action, m.group()
+                         ctx.pos = m.end()
+                     else:
+                         for item in action(self, m, ctx):
+                             yield item
+                         if not new_state:
+                             # altered the state stack?
+                             statetokens = tokendefs[ctx.stack[-1]]
+                         # CAUTION: callback must set ctx.pos!
+                     if new_state is not None:
+                         # state transition
+                         if isinstance(new_state, tuple):
+                             ctx.stack.extend(new_state)
+                         elif isinstance(new_state, int):
+                             # pop
+                             del ctx.stack[new_state:]
+                         elif new_state == '#push':
+                             ctx.stack.append(ctx.stack[-1])
+                         else:
+                             assert False, "wrong state def: %r" % new_state
+                         statetokens = tokendefs[ctx.stack[-1]]
+                     break
+             else:
+                 try:
+                     if ctx.pos >= ctx.end:
+                         break
+                     if text[ctx.pos] == '\n':
+                         # at EOL, reset state to "root"
+                         ctx.pos += 1
+                         ctx.stack = ['root']
+                         statetokens = tokendefs['root']
+                         yield ctx.pos, Text, u'\n'
+                         continue
+                     yield ctx.pos, Error, text[ctx.pos]
+                     ctx.pos += 1
+                 except IndexError:
+                     break
+
+
+ def do_insertions(insertions, tokens):
+     """
+     Helper for lexers which must combine the results of several
+     sublexers.
+
+     ``insertions`` is a list of ``(index, itokens)`` pairs.
+     Each ``itokens`` iterable should be inserted at position
+     ``index`` into the token stream given by the ``tokens``
+     argument.
+
+     The result is a combined token stream.
+
+     TODO: clean up the code here.
+     """
+     insertions = iter(insertions)
+     try:
+         index, itokens = insertions.next()
+     except StopIteration:
+         # no insertions
+         for item in tokens:
+             yield item
+         return
+
+     realpos = None
+     insleft = True
+
+     # iterate over the token stream where we want to insert
+     # the tokens from the insertion list.
+     for i, t, v in tokens:
+         # first iteration. store the position of the first item
+         if realpos is None:
+             realpos = i
+         oldi = 0
+         while insleft and i + len(v) >= index:
+             tmpval = v[oldi:index - i]
+             yield realpos, t, tmpval
+             realpos += len(tmpval)
+             for it_index, it_token, it_value in itokens:
+                 yield realpos, it_token, it_value
+                 realpos += len(it_value)
+             oldi = index - i
+             try:
+                 index, itokens = insertions.next()
+             except StopIteration:
+                 insleft = False
+                 break  # not strictly necessary
+         yield realpos, t, v[oldi:]
+         realpos += len(v) - oldi
+
+     # leftover tokens
+     while insleft:
+         # no normal tokens, set realpos to zero
+         realpos = realpos or 0
+         for p, t, v in itokens:
+             yield realpos, t, v
+             realpos += len(v)
+         try:
+             index, itokens = insertions.next()
+         except StopIteration:
+             insleft = False
+             break  # not strictly necessary
+
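
The tokens table, include, bygroups and using defined above are the whole surface a lexer author needs. As a rough illustration of how they fit together against this vendored API, here is a minimal sketch; the ToyIniLexer class, its rules and the sample string are made up for this example and are not part of the gem.

    from pygments.lexer import RegexLexer, include, bygroups
    from pygments.token import Comment, Name, Operator, String, Text

    class ToyIniLexer(RegexLexer):
        """Hypothetical INI-style lexer: comments, [sections] and key = value pairs."""
        name = 'ToyINI'
        aliases = ['toyini']
        filenames = ['*.toyini']

        tokens = {
            'whitespace': [
                (r'\s+', Text),
            ],
            'root': [
                include('whitespace'),          # pull in the rules of another state
                (r';.*?$', Comment),            # comment runs to end of line
                (r'\[.*?\]$', Name.Namespace),  # [section] header
                # bygroups() assigns one token type per regex group
                (r'(.*?)([ \t]*)(=)([ \t]*)(.*?)$',
                 bygroups(Name.Attribute, Text, Operator, Text, String)),
            ],
        }

    # Lexer.get_tokens() preprocesses the input and yields filtered
    # (tokentype, value) pairs, as implemented above.
    sample = '; demo\n[core]\nname = pygmentize\n'
    for tokentype, value in ToyIniLexer().get_tokens(sample):
        pass  # e.g. hand the pairs to a formatter

Because RegexLexerMeta compiles the tokens table on first instantiation, the class definition itself stays declarative: include() splices in another state's rules, and bygroups() is the callback form produced by the helper of the same name above.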