pygments.rb 0.2.1 → 0.2.2

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
data/README.md CHANGED
@@ -1,6 +1,54 @@
  # pygments.rb
 
- a ruby wrapper for the pygments syntax highlighter via embedded python.
+ A ruby wrapper for the python [pygments syntax highlighter](http://pygments.org/).
+
+ This library replaces [github/albino](https://github.com/github/albino).
+ Instead of shelling out to `pygmentize`, it embeds the python
+ interpreter inside ruby via FFI. This avoids the cost of setting up the
+ python VM on every invocation and speeds up code highlighting from ruby by 10-15x.
+
+ ## usage
+
+ ``` ruby
+ Pygments.highlight(File.read(__FILE__), :lexer => 'ruby')
+ ```
+
+ Encoding and other lexer/formatter options can be passed in via an
+ options hash:
+
+ ``` ruby
+ Pygments.highlight('code', :options => {:encoding => 'utf-8'})
+ ```
+
+ To use a formatter other than html, specify it explicitly:
+
+ ``` ruby
+ Pygments.highlight('code', :formatter => 'bbcode')
+ Pygments.highlight('code', :formatter => 'terminal')
+ ```
+
+ To generate CSS for html formatted code, use the css method:
+
+ ``` ruby
+ Pygments.css
+ Pygments.css('.highlight')
+ ```
+
+ To use a custom python installation (like in ArchLinux), tell
+ RubyPython where python lives:
+
+ ``` ruby
+ RubyPython.configure :python_exe => 'python2.7'
+ ```
+
+ To use a custom pygments installation, specify the path to
+ Pygments.start:
+
+ ``` ruby
+ Pygments.start("/path/to/pygments")
+ ```
+
+ ## benchmarks
 
  $ ruby -rubygems bench.rb 50
  user system total real
@@ -9,3 +57,5 @@ a ruby wrapper for the pygments syntax highlighter via embedded python.
  pygments::ffi + reload 11.350000 1.240000 12.590000 ( 12.692320)
  pygments::ffi 1.130000 0.010000 1.140000 ( 1.171589)
 
+ To run `bench.rb`, use a git checkout. The C extension is not included
+ in gem releases.
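Putting the calls documented in the README together, a small sketch of end-to-end use (the input path and CSS selector here are only examples, not part of the README):

``` ruby
require 'pygments'

# Highlight a Ruby file to HTML with an options hash, then emit matching CSS.
html = Pygments.highlight(File.read('some_file.rb'),
                          :lexer   => 'ruby',
                          :options => {:encoding => 'utf-8'})
css  = Pygments.css('.highlight')
```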
data/ext/extconf.rb CHANGED
@@ -9,6 +9,6 @@ $CFLAGS << " -Wall "
  unless python
    $stderr.puts '*** could not find libpython or Python.h'
  else
-   $CFLAGS << " -I/usr/include/python#{python} "
+   $defs << "-DPYGMENTS_PYTHON_VERSION=#{python.gsub('.','')}"
    create_makefile('pygments_ext')
  end
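The new `$defs` entry passes the detected interpreter version to the C extension as a preprocessor define instead of hard-coding an include path; `pygments.c` (next section) selects the matching `Python.h` header from it. A rough illustration of the transformation, assuming the extconf probe found version 2.7:

``` ruby
# Illustration only: '2.7' stands in for whatever extconf.rb detected.
python = '2.7'
flag   = "-DPYGMENTS_PYTHON_VERSION=#{python.gsub('.', '')}"
# => "-DPYGMENTS_PYTHON_VERSION=27", matched by the #if/#elif chain below
```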
data/ext/pygments.c CHANGED
@@ -2,7 +2,18 @@
  #include <stdlib.h>
 
  #include <ruby.h>
- #include <Python.h>
+
+ #if PYGMENTS_PYTHON_VERSION == 24
+ #include <python2.4/Python.h>
+ #elif PYGMENTS_PYTHON_VERSION == 25
+ #include <python2.5/Python.h>
+ #elif PYGMENTS_PYTHON_VERSION == 26
+ #include <python2.6/Python.h>
+ #elif PYGMENTS_PYTHON_VERSION == 27
+ #include <python2.7/Python.h>
+ #else
+ #error Unknown python version
+ #endif
 
  #ifdef RUBY_VM
  #include <ruby/st.h>
data/lib/pygments/ffi.rb CHANGED
@@ -6,6 +6,7 @@ module Pygments
 
    def start(pygments_path = File.expand_path('../../../vendor/pygments-main/', __FILE__))
      RubyPython.start
+     RubyPython.import('pkg_resources') rescue nil
      sys = RubyPython.import('sys')
      sys.path.insert(0, pygments_path)
 
@@ -65,7 +65,7 @@ module Pygments
    #
    # Returns the Lexer or nil if none was found.
    def self.find(name)
-     @index[name.downcase]
+     @index[name.to_s.downcase]
    end
 
    # Public: Alias for find.
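With `name.to_s.downcase`, the lookup key is normalized before hitting the index, so symbol and mixed-case lookups behave like lowercase string ones. A usage sketch, assuming the method is reached as `Pygments::Lexer.find`:

``` ruby
# 'ruby' is only an example lexer name; all three resolve the same index key.
Pygments::Lexer.find('ruby')
Pygments::Lexer.find('Ruby')
Pygments::Lexer.find(:ruby)
```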
@@ -1,3 +1,3 @@
  module Pygments
-   VERSION = '0.2.1'
+   VERSION = '0.2.2'
  end
@@ -39,6 +39,7 @@ Other contributors, listed alphabetically, are:
  * Matthew Harrison -- SVG formatter
  * Steven Hazel -- Tcl lexer
  * Aslak Hellesøy -- Gherkin lexer
+ * Jordi Gutiérrez Hermoso -- Octave lexer
  * David Hess, Fish Software, Inc. -- Objective-J lexer
  * Varun Hiremath -- Debian control lexer
  * Ben Hollis -- Mason lexer
@@ -78,6 +79,7 @@ Other contributors, listed alphabetically, are:
  * Ken Schutte -- Matlab lexers
  * Tassilo Schweyer -- Io, MOOCode lexers
  * Joerg Sieker -- ABAP lexer
+ * Robert Simmons -- Standard ML lexer
  * Kirill Simonov -- YAML lexer
  * Steve Spigarelli -- XQuery lexer
  * Jerome St-Louis -- eC lexer
@@ -90,6 +92,7 @@ Other contributors, listed alphabetically, are:
  * Dietmar Winkler -- Modelica lexer
  * Nils Winter -- Smalltalk lexer
  * Davy Wybiral -- Clojure lexer
+ * Diego Zamboni -- CFengine3 lexer
  * Alex Zimin -- Nemerle lexer
 
  Many thanks for all contributions!
@@ -21,6 +21,9 @@ Version 1.5
  * PostgreSQL (#660)
  * DTD
  * Gosu
+ * Octave (PR#22)
+ * Standard ML (PR#14)
+ * CFengine3 (#601)
 
  - In the LaTeX formatter, escape special &, < and > chars (#648).
 
@@ -41,6 +44,8 @@ Version 1.5
 
  - Fix generic type highlighting in ActionScript 3 (#666).
 
+ - Fixes to the Clojure lexer (PR#9).
+
 
  Version 1.4
  -----------
@@ -1 +1 @@
- 456992e7ff81
+ db34feabe4b8
@@ -121,7 +121,7 @@ sections, comments and key = value pairs:
  }
 
  The lexer first looks for whitespace, comments and section names. And later it
- looks for a line that looks like a key, value pair, seperated by an ``'='``
+ looks for a line that looks like a key, value pair, separated by an ``'='``
  sign, and optional whitespace.
 
  The `bygroups` helper makes sure that each group is yielded with a different
@@ -85,7 +85,7 @@ Here a small overview of all allowed styles:
  ``bold``
      render text as bold
  ``nobold``
-     don't render text as bold (to prevent subtokens behing highlighted bold)
+     don't render text as bold (to prevent subtokens being highlighted bold)
  ``italic``
      render text italic
  ``noitalic``
@@ -219,7 +219,7 @@ def main(args=sys.argv):
        return 0
 
    if opts.pop('-V', None) is not None:
-       print 'Pygments version %s, (c) 2006-2008 by Georg Brandl.' % __version__
+       print 'Pygments version %s, (c) 2006-2011 by Georg Brandl.' % __version__
        return 0
 
    # handle ``pygmentize -L``
@@ -286,7 +286,7 @@ class LatexFormatter(Formatter):
        cp = self.commandprefix
        styles = []
        for name, definition in self.cmd2def.iteritems():
-           styles.append(r'\def\%s@tok@%s{%s}' % (cp, name, definition))
+           styles.append(r'\expandafter\def\csname %s@tok@%s\endcsname{%s}' % (cp, name, definition))
        return STYLE_TEMPLATE % {'cp': self.commandprefix,
                                 'styles': '\n'.join(styles)}
 
@@ -46,6 +46,7 @@ LEXERS = {
      'CObjdumpLexer': ('pygments.lexers.asm', 'c-objdump', ('c-objdump',), ('*.c-objdump',), ('text/x-c-objdump',)),
      'CSharpAspxLexer': ('pygments.lexers.dotnet', 'aspx-cs', ('aspx-cs',), ('*.aspx', '*.asax', '*.ascx', '*.ashx', '*.asmx', '*.axd'), ()),
      'CSharpLexer': ('pygments.lexers.dotnet', 'C#', ('csharp', 'c#'), ('*.cs',), ('text/x-csharp',)),
+     'Cfengine3Lexer': ('pygments.lexers.other', 'CFEngine3', ('cfengine3', 'cf3'), ('*.cf',), ()),
      'CheetahHtmlLexer': ('pygments.lexers.templates', 'HTML+Cheetah', ('html+cheetah', 'html+spitfire'), (), ('text/html+cheetah', 'text/html+spitfire')),
      'CheetahJavascriptLexer': ('pygments.lexers.templates', 'JavaScript+Cheetah', ('js+cheetah', 'javascript+cheetah', 'js+spitfire', 'javascript+spitfire'), (), ('application/x-javascript+cheetah', 'text/x-javascript+cheetah', 'text/javascript+cheetah', 'application/x-javascript+spitfire', 'text/x-javascript+spitfire', 'text/javascript+spitfire')),
      'CheetahLexer': ('pygments.lexers.templates', 'Cheetah', ('cheetah', 'spitfire'), ('*.tmpl', '*.spt'), ('application/x-cheetah', 'application/x-spitfire')),
@@ -135,7 +136,7 @@ LEXERS = {
      'MakoXmlLexer': ('pygments.lexers.templates', 'XML+Mako', ('xml+mako',), (), ('application/xml+mako',)),
      'MaqlLexer': ('pygments.lexers.other', 'MAQL', ('maql',), ('*.maql',), ('text/x-gooddata-maql', 'application/x-gooddata-maql')),
      'MasonLexer': ('pygments.lexers.templates', 'Mason', ('mason',), ('*.m', '*.mhtml', '*.mc', '*.mi', 'autohandler', 'dhandler'), ('application/x-mason',)),
-     'MatlabLexer': ('pygments.lexers.math', 'Matlab', ('matlab', 'octave'), ('*.m',), ('text/matlab',)),
+     'MatlabLexer': ('pygments.lexers.math', 'Matlab', ('matlab',), ('*.m',), ('text/matlab',)),
      'MatlabSessionLexer': ('pygments.lexers.math', 'Matlab session', ('matlabsession',), (), ()),
      'MiniDLexer': ('pygments.lexers.agile', 'MiniD', ('minid',), ('*.md',), ('text/x-minidsrc',)),
      'ModelicaLexer': ('pygments.lexers.other', 'Modelica', ('modelica',), ('*.mo',), ('text/x-modelica',)),
@@ -160,6 +161,7 @@ LEXERS = {
      'ObjectiveJLexer': ('pygments.lexers.web', 'Objective-J', ('objective-j', 'objectivej', 'obj-j', 'objj'), ('*.j',), ('text/x-objective-j',)),
      'OcamlLexer': ('pygments.lexers.compiled', 'OCaml', ('ocaml',), ('*.ml', '*.mli', '*.mll', '*.mly'), ('text/x-ocaml',)),
      'OcamlLexer': ('pygments.lexers.functional', 'OCaml', ('ocaml',), ('*.ml', '*.mli', '*.mll', '*.mly'), ('text/x-ocaml',)),
+     'OctaveLexer': ('pygments.lexers.math', 'Octave', ('octave',), ('*.m',), ('text/octave',)),
      'OocLexer': ('pygments.lexers.compiled', 'Ooc', ('ooc',), ('*.ooc',), ('text/x-ooc',)),
      'PerlLexer': ('pygments.lexers.agile', 'Perl', ('perl', 'pl'), ('*.pl', '*.pm'), ('text/x-perl', 'application/x-perl')),
      'PhpLexer': ('pygments.lexers.web', 'PHP', ('php', 'php3', 'php4', 'php5'), ('*.php', '*.php[345]'), ('text/x-php',)),
@@ -194,6 +196,7 @@ LEXERS = {
      'RubyConsoleLexer': ('pygments.lexers.agile', 'Ruby irb session', ('rbcon', 'irb'), (), ('text/x-ruby-shellsession',)),
      'RubyLexer': ('pygments.lexers.agile', 'Ruby', ('rb', 'ruby', 'duby'), ('*.rb', '*.rbw', 'Rakefile', '*.rake', '*.gemspec', '*.rbx', '*.duby'), ('text/x-ruby', 'application/x-ruby')),
      'SLexer': ('pygments.lexers.math', 'S', ('splus', 's', 'r'), ('*.S', '*.R'), ('text/S-plus', 'text/S', 'text/R')),
+     'SMLLexer': ('pygments.lexers.functional', 'Standard ML', ('sml',), ('*.sml', '*.sig', '*.fun'), ('text/x-standardml', 'application/x-standardml')),
      'SassLexer': ('pygments.lexers.web', 'Sass', ('sass', 'SASS'), ('*.sass',), ('text/x-sass',)),
      'ScalaLexer': ('pygments.lexers.compiled', 'Scala', ('scala',), ('*.scala',), ('text/x-scala',)),
      'ScamlLexer': ('pygments.lexers.web', 'Scaml', ('scaml', 'SCAML'), ('*.scaml',), ('text/x-scaml',)),
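The new mapping entries register the Octave, Standard ML and CFEngine3 lexers under the aliases shown above, so pygments.rb can request them by name. A usage sketch with hypothetical file names:

``` ruby
Pygments.highlight(File.read('stats.m'),    :lexer => 'octave')
Pygments.highlight(File.read('queue.sml'),  :lexer => 'sml')
Pygments.highlight(File.read('promise.cf'), :lexer => 'cf3')
```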
@@ -13,7 +13,7 @@ import re
 
  from pygments.lexer import Lexer, RegexLexer, ExtendedRegexLexer, \
       LexerContext, include, combined, do_insertions, bygroups, using, this
- from pygments.token import Error, Text, Other, \
+ from pygments.token import Error, Text, Whitespace, Other, \
       Comment, Operator, Keyword, Name, String, Number, Generic, Punctuation
  from pygments.util import get_bool_opt, get_list_opt, shebang_matches
  from pygments import unistring as uni
@@ -1367,13 +1367,11 @@ class ClojureLexer(RegexLexer):
 
      keywords = [
          'fn', 'def', 'defn', 'defmacro', 'defmethod', 'defmulti', 'defn-',
-         'defstruct',
-         'if', 'cond',
-         'let', 'for'
+         'defstruct', 'if', 'cond', 'let', 'for'
      ]
      builtins = [
          '.', '..',
-         '*', '+', '-', '->', '..', '/', '<', '<=', '=', '==', '>', '>=',
+         '*', '+', '-', '->', '/', '<', '<=', '=', '==', '>', '>=',
          'accessor', 'agent', 'agent-errors', 'aget', 'alength', 'all-ns',
          'alter', 'and', 'append-child', 'apply', 'array-map', 'aset',
          'aset-boolean', 'aset-byte', 'aset-char', 'aset-double', 'aset-float',
@@ -1389,13 +1387,13 @@ class ClojureLexer(RegexLexer):
          'double', 'down', 'drop', 'drop-while', 'edit', 'end?', 'ensure',
          'eval', 'every?', 'false?', 'ffirst', 'file-seq', 'filter', 'find',
          'find-doc', 'find-ns', 'find-var', 'first', 'float', 'flush',
-         'fnseq', 'frest', 'gensym', 'get', 'get-proxy-class',
+         'fnseq', 'frest', 'gensym', 'get-proxy-class', 'get',
          'hash-map', 'hash-set', 'identical?', 'identity', 'if-let', 'import',
          'in-ns', 'inc', 'index', 'insert-child', 'insert-left', 'insert-right',
          'inspect-table', 'inspect-tree', 'instance?', 'int', 'interleave',
          'intersection', 'into', 'into-array', 'iterate', 'join', 'key', 'keys',
          'keyword', 'keyword?', 'last', 'lazy-cat', 'lazy-cons', 'left',
-         'lefts', 'line-seq', 'list', 'list*', 'load', 'load-file',
+         'lefts', 'line-seq', 'list*', 'list', 'load', 'load-file',
          'locking', 'long', 'loop', 'macroexpand', 'macroexpand-1',
          'make-array', 'make-node', 'map', 'map-invert', 'map?', 'mapcat',
          'max', 'max-key', 'memfn', 'merge', 'merge-with', 'meta', 'min',
@@ -1426,7 +1424,14 @@ class ClojureLexer(RegexLexer):
      # valid names for identifiers
      # well, names can only not consist fully of numbers
      # but this should be good enough for now
-     valid_name = r'[a-zA-Z0-9!$%&*+,/:<=>?@^_~-]+'
+
+     # TODO / should divide keywords/symbols into namespace/rest
+     # but that's hard, so just pretend / is part of the name
+     valid_name = r'[\w!$%*+,<=>?/.-]+'
+
+     def _multi_escape(entries):
+         return '|'.join([re.escape(entry) + '(?![\\w-!$%*+,<=>?/.-])'
+                          for entry in entries])
 
      tokens = {
          'root' : [
@@ -1435,42 +1440,29 @@ class ClojureLexer(RegexLexer):
              (r';.*$', Comment.Single),
 
              # whitespaces - usually not relevant
-             (r'\s+', Text),
+             (r'[,\s]+', Whitespace),
 
              # numbers
              (r'-?\d+\.\d+', Number.Float),
              (r'-?\d+', Number.Integer),
-             # support for uncommon kinds of numbers -
-             # have to figure out what the characters mean
-             #(r'(#e|#i|#b|#o|#d|#x)[\d.]+', Number),
+             (r'0x-?[abcdef\d]+', Number.Hex),
 
              # strings, symbols and characters
             (r'"(\\\\|\\"|[^"])*"', String),
             (r"'" + valid_name, String.Symbol),
-            (r"\\([()/'\".'_!§$%& ?;=#+-]{1}|[a-zA-Z0-9]+)", String.Char),
+            (r"\\(.|[a-z]+)", String.Char),
 
-             # constants
-             (r'(#t|#f)', Name.Constant),
+             # keywords
+             (r':' + valid_name, Name.Constant),
 
              # special operators
-             (r"('|#|`|,@|,|\.)", Operator),
+             (r'~@|[`\'#^~&]', Operator),
 
              # highlight the keywords
-             ('(%s)' % '|'.join([
-                 re.escape(entry) + ' ' for entry in keywords]),
-                 Keyword
-             ),
-
-             # first variable in a quoted string like
-             # '(this is syntactic sugar)
-             (r"(?<='\()" + valid_name, Name.Variable),
-             (r"(?<=#\()" + valid_name, Name.Variable),
+             (_multi_escape(keywords), Keyword),
 
              # highlight the builtins
-             ("(?<=\()(%s)" % '|'.join([
-                 re.escape(entry) + ' ' for entry in builtins]),
-                 Name.Builtin
-             ),
+             (_multi_escape(builtins), Name.Builtin),
 
              # the remaining functions
              (r'(?<=\()' + valid_name, Name.Function),
@@ -13,11 +13,12 @@ import re
 
  from pygments.lexer import Lexer, RegexLexer, bygroups, include, do_insertions
  from pygments.token import Text, Comment, Operator, Keyword, Name, \
-      String, Number, Punctuation, Literal, Generic
+      String, Number, Punctuation, Literal, Generic, Error
 
 
- __all__ = ['SchemeLexer', 'CommonLispLexer', 'HaskellLexer', 'LiterateHaskellLexer',
-            'OcamlLexer', 'ErlangLexer', 'ErlangShellLexer']
+ __all__ = ['SchemeLexer', 'CommonLispLexer', 'HaskellLexer',
+            'LiterateHaskellLexer', 'SMLLexer', 'OcamlLexer', 'ErlangLexer',
+            'ErlangShellLexer']
 
 
  class SchemeLexer(RegexLexer):
@@ -515,6 +516,329 @@ class LiterateHaskellLexer(Lexer):
              yield item
 
 
+ class SMLLexer(RegexLexer):
+     """
+     For the Standard ML language.
+
+     *New in Pygments 1.5.*
+     """
+
+     name = 'Standard ML'
+     aliases = ['sml']
+     filenames = ['*.sml', '*.sig', '*.fun',]
+     mimetypes = ['text/x-standardml', 'application/x-standardml']
+
+     alphanumid_reserved = [
+         # Core
+         'abstype', 'and', 'andalso', 'as', 'case', 'datatype', 'do', 'else',
+         'end', 'exception', 'fn', 'fun', 'handle', 'if', 'in', 'infix',
+         'infixr', 'let', 'local', 'nonfix', 'of', 'op', 'open', 'orelse',
+         'raise', 'rec', 'then', 'type', 'val', 'with', 'withtype', 'while',
+         # Modules
+         'eqtype', 'functor', 'include', 'sharing', 'sig', 'signature',
+         'struct', 'structure', 'where',
+     ]
+
+     symbolicid_reserved = [
+         # Core
+         ':', '\|', '=', '=>', '->', '#',
+         # Modules
+         ':>',
+     ]
+
+     nonid_reserved = [ '(', ')', '[', ']', '{', '}', ',', ';', '...', '_' ]
+
+     alphanumid_re = r"[a-zA-Z][a-zA-Z0-9_']*"
+     symbolicid_re = r"[!%&$#+\-/:<=>?@\\~`^|*]+"
+
+     # A character constant is a sequence of the form #s, where s is a string
+     # constant denoting a string of size one character. This setup just parses
+     # the entire string as either a String.Double or a String.Char (depending
+     # on the argument), even if the String.Char is an erronous
+     # multiple-character string.
+     def stringy (whatkind):
+         return [
+             (r'[^"\\]', whatkind),
+             (r'\\[\\\"abtnvfr]', String.Escape),
+             (r'\\\^[@-^]', String.Escape),
+             (r'\\[0-9]{3}', String.Escape),
+             (r'\\u[0-9a-fA-F]{4}', String.Escape),
+             (r'\\\s+\\', String.Interpol),
+             (r'"', whatkind, '#pop'),
+         ]
+
+     # Callbacks for distinguishing tokens and reserved words
+     def long_id_callback(self, match):
+         if match.group(1) in self.alphanumid_reserved: token = Error
+         else: token = Name.Namespace
+         yield match.start(1), token, match.group(1)
+         yield match.start(2), Punctuation, match.group(2)
+
+     def end_id_callback(self, match):
+         if match.group(1) in self.alphanumid_reserved: token = Error
+         elif match.group(1) in self.symbolicid_reserved: token = Error
+         else: token = Name
+         yield match.start(1), token, match.group(1)
+
+     def id_callback(self, match):
+         str = match.group(1)
+         if str in self.alphanumid_reserved: token = Keyword.Reserved
+         elif str in self.symbolicid_reserved: token = Punctuation
+         else: token = Name
+         yield match.start(1), token, str
+
+     tokens = {
+         # Whitespace and comments are (almost) everywhere
+         'whitespace': [
+             (r'\s+', Text),
+             (r'\(\*', Comment.Multiline, 'comment'),
+         ],
+
+         'delimiters': [
+             # This lexer treats these delimiters specially:
+             # Delimiters define scopes, and the scope is how the meaning of
+             # the `|' is resolved - is it a case/handle expression, or function
+             # definition by cases? (This is not how the Definition works, but
+             # it's how MLton behaves, see http://mlton.org/SMLNJDeviations)
+             (r'\(|\[|{', Punctuation, 'main'),
+             (r'\)|\]|}', Punctuation, '#pop'),
+             (r'\b(let|if|local)\b(?!\')', Keyword.Reserved, ('main', 'main')),
+             (r'\b(struct|sig|while)\b(?!\')', Keyword.Reserved, 'main'),
+             (r'\b(do|else|end|in|then)\b(?!\')', Keyword.Reserved, '#pop'),
+         ],
+
+         'core': [
+             # Punctuation that doesn't overlap symbolic identifiers
+             (r'(%s)' % '|'.join([re.escape(z) for z in nonid_reserved]),
+              Punctuation),
+
+             # Special constants: strings, floats, numbers in decimal and hex
+             (r'#"', String.Char, 'char'),
+             (r'"', String.Double, 'string'),
+             (r'~?0x[0-9a-fA-F]+', Number.Hex),
+             (r'0wx[0-9a-fA-F]+', Number.Hex),
+             (r'0w\d+', Number.Integer),
+             (r'~?\d+\.\d+[eE]~?\d+', Number.Float),
+             (r'~?\d+\.\d+', Number.Float),
+             (r'~?\d+[eE]~?\d+', Number.Float),
+             (r'~?\d+', Number.Integer),
+
+             # Labels
+             (r'#\s*[1-9][0-9]*', Name.Label),
+             (r'#\s*(%s)' % alphanumid_re, Name.Label),
+             (r'#\s+(%s)' % symbolicid_re, Name.Label),
+             # Some reserved words trigger a special, local lexer state change
+             (r'\b(datatype|abstype)\b(?!\')', Keyword.Reserved, 'dname'),
+             (r'(?=\b(exception)\b(?!\'))', Text, ('ename')),
+             (r'\b(functor|include|open|signature|structure)\b(?!\')',
+              Keyword.Reserved, 'sname'),
+             (r'\b(type|eqtype)\b(?!\')', Keyword.Reserved, 'tname'),
+
+             # Regular identifiers, long and otherwise
+             (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
+             (r'(%s)(\.)' % alphanumid_re, long_id_callback, "dotted"),
+             (r'(%s)' % alphanumid_re, id_callback),
+             (r'(%s)' % symbolicid_re, id_callback),
+         ],
+         'dotted': [
+             (r'(%s)(\.)' % alphanumid_re, long_id_callback),
+             (r'(%s)' % alphanumid_re, end_id_callback, "#pop"),
+             (r'(%s)' % symbolicid_re, end_id_callback, "#pop"),
+             (r'\s+', Error),
+             (r'\S+', Error),
+         ],
+
+
+         # Main parser (prevents errors in files that have scoping errors)
+         'root': [ (r'', Text, 'main') ],
+
+         # In this scope, I expect '|' to not be followed by a function name,
+         # and I expect 'and' to be followed by a binding site
+         'main': [
+             include('whitespace'),
+
+             # Special behavior of val/and/fun
+             (r'\b(val|and)\b(?!\')', Keyword.Reserved, 'vname'),
+             (r'\b(fun)\b(?!\')', Keyword.Reserved,
+              ('#pop', 'main-fun', 'fname')),
+
+             include('delimiters'),
+             include('core'),
+             (r'\S+', Error),
+         ],
+
+         # In this scope, I expect '|' and 'and' to be followed by a function
+         'main-fun': [
+             include('whitespace'),
+
+             (r'\s', Text),
+             (r'\(\*', Comment.Multiline, 'comment'),
+
+             # Special behavior of val/and/fun
+             (r'\b(fun|and)\b(?!\')', Keyword.Reserved, 'fname'),
+             (r'\b(val)\b(?!\')', Keyword.Reserved,
+              ('#pop', 'main', 'vname')),
+
+             # Special behavior of '|' and '|'-manipulating keywords
+             (r'\|', Punctuation, 'fname'),
+             (r'\b(case|handle)\b(?!\')', Keyword.Reserved,
+              ('#pop', 'main')),
+
+             include('delimiters'),
+             include('core'),
+             (r'\S+', Error),
+         ],
+
+         # Character and string parsers
+         'char': stringy(String.Char),
+         'string': stringy(String.Double),
+
+         'breakout': [
+             (r'(?=\b(%s)\b(?!\'))' % '|'.join(alphanumid_reserved), Text, '#pop'),
+         ],
+
+         # Dealing with what comes after module system keywords
+         'sname': [
+             include('whitespace'),
+             include('breakout'),
+
+             (r'(%s)' % alphanumid_re, Name.Namespace),
+             (r'', Text, '#pop'),
+         ],
+
+         # Dealing with what comes after the 'fun' (or 'and' or '|') keyword
+         'fname': [
+             include('whitespace'),
+             (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
+             (r'\(', Punctuation, 'tyvarseq'),
+
+             (r'(%s)' % alphanumid_re, Name.Function, '#pop'),
+             (r'(%s)' % symbolicid_re, Name.Function, '#pop'),
+
+             # Ignore interesting function declarations like "fun (x + y) = ..."
+             (r'', Text, '#pop'),
+         ],
+
+         # Dealing with what comes after the 'val' (or 'and') keyword
+         'vname': [
+             include('whitespace'),
+             (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
+             (r'\(', Punctuation, 'tyvarseq'),
+
+             (r'(%s)(\s*)(=(?!%s))' % (alphanumid_re, symbolicid_re),
+              bygroups(Name.Variable, Text, Punctuation), '#pop'),
+             (r'(%s)(\s*)(=(?!%s))' % (symbolicid_re, symbolicid_re),
+              bygroups(Name.Variable, Text, Punctuation), '#pop'),
+             (r'(%s)' % alphanumid_re, Name.Variable, '#pop'),
+             (r'(%s)' % symbolicid_re, Name.Variable, '#pop'),
+
+             # Ignore interesting patterns like 'val (x, y)'
+             (r'', Text, '#pop'),
+         ],
+
+         # Dealing with what comes after the 'type' (or 'and') keyword
+         'tname': [
+             include('whitespace'),
+             include('breakout'),
+
+             (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
+             (r'\(', Punctuation, 'tyvarseq'),
+             (r'=(?!%s)' % symbolicid_re, Punctuation, ('#pop', 'typbind')),
+
+             (r'(%s)' % alphanumid_re, Keyword.Type),
+             (r'(%s)' % symbolicid_re, Keyword.Type),
+             (r'\S+', Error, '#pop'),
+         ],
+
+         # A type binding includes most identifiers
+         'typbind': [
+             include('whitespace'),
+
+             (r'\b(and)\b(?!\')', Keyword.Reserved, ('#pop', 'tname')),
+
+             include('breakout'),
+             include('core'),
+             (r'\S+', Error, '#pop'),
+         ],
+
+         # Dealing with what comes after the 'datatype' (or 'and') keyword
+         'dname': [
+             include('whitespace'),
+             include('breakout'),
+
+             (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
+             (r'\(', Punctuation, 'tyvarseq'),
+             (r'(=)(\s*)(datatype)',
+              bygroups(Punctuation, Text, Keyword.Reserved), '#pop'),
+             (r'=(?!%s)' % symbolicid_re, Punctuation,
+              ('#pop', 'datbind', 'datcon')),
+
+             (r'(%s)' % alphanumid_re, Keyword.Type),
+             (r'(%s)' % symbolicid_re, Keyword.Type),
+             (r'\S+', Error, '#pop'),
+         ],
+
+         # common case - A | B | C of int
+         'datbind': [
+             include('whitespace'),
+
+             (r'\b(and)\b(?!\')', Keyword.Reserved, ('#pop', 'dname')),
+             (r'\b(withtype)\b(?!\')', Keyword.Reserved, ('#pop', 'tname')),
+             (r'\b(of)\b(?!\')', Keyword.Reserved),
+
+             (r'(\|)(\s*)(%s)' % alphanumid_re,
+              bygroups(Punctuation, Text, Name.Class)),
+             (r'(\|)(\s+)(%s)' % symbolicid_re,
+              bygroups(Punctuation, Text, Name.Class)),
+
+             include('breakout'),
+             include('core'),
+             (r'\S+', Error),
+         ],
+
+         # Dealing with what comes after an exception
+         'ename': [
+             include('whitespace'),
+
+             (r'(exception|and)\b(\s+)(%s)' % alphanumid_re,
+              bygroups(Keyword.Reserved, Text, Name.Class)),
+             (r'(exception|and)\b(\s*)(%s)' % symbolicid_re,
+              bygroups(Keyword.Reserved, Text, Name.Class)),
+             (r'\b(of)\b(?!\')', Keyword.Reserved),
+
+             include('breakout'),
+             include('core'),
+             (r'\S+', Error),
+         ],
+
+         'datcon': [
+             include('whitespace'),
+             (r'(%s)' % alphanumid_re, Name.Class, '#pop'),
+             (r'(%s)' % symbolicid_re, Name.Class, '#pop'),
+             (r'\S+', Error, '#pop'),
+         ],
+
+         # Series of type variables
+         'tyvarseq': [
+             (r'\s', Text),
+             (r'\(\*', Comment.Multiline, 'comment'),
+
+             (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
+             (alphanumid_re, Name),
+             (r',', Punctuation),
+             (r'\)', Punctuation, '#pop'),
+             (symbolicid_re, Name),
+         ],
+
+         'comment': [
+             (r'[^(*)]', Comment.Multiline),
+             (r'\(\*', Comment.Multiline, '#push'),
+             (r'\*\)', Comment.Multiline, '#pop'),
+             (r'[(*)]', Comment.Multiline),
+         ],
+     }
+
+
  class OcamlLexer(RegexLexer):
      """
      For the OCaml language.