pygments.rb 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,6 +1,54 @@
1
1
  # pygments.rb
2
2
 
3
- a ruby wrapper for the pygments syntax highlighter via embedded python.
3
+ A ruby wrapper for the python [pygments syntax highlighter](http://pygments.org/).
4
+
5
+ This library replaces [github/albino](https://github.com/github/albino).
6
+ Instead of shelling out to `pygmentize`, it embeds the python
7
+ interpreter inside ruby via FFI. This avoids the cost of setting up the
8
+ python VM on every invocation and speeds up code highlighting from ruby by 10-15x.
9
+
10
+ ## usage
11
+
12
+ ``` ruby
13
+ Pygments.highlight(File.read(__FILE__), :lexer => 'ruby')
14
+ ```
15
+
16
+ Encoding and other lexer/formatter options can be passed in via an
17
+ options hash:
18
+
19
+ ``` ruby
20
+ Pygments.highlight('code', :options => {:encoding => 'utf-8'})
21
+ ```
22
+
23
+ To use a formatter other than html, specify it explicitly:
24
+
25
+ ``` ruby
26
+ Pygments.highlight('code', :formatter => 'bbcode')
27
+ Pygments.highlight('code', :formatter => 'terminal')
28
+ ```
29
+
30
+ To generate CSS for html formatted code, use the css method:
31
+
32
+ ``` ruby
33
+ Pygments.css
34
+ Pygments.css('.highlight')
35
+ ```
36
+
37
+ To use a custom python installation (like in ArchLinux), tell
38
+ RubyPython where python lives:
39
+
40
+ ``` ruby
41
+ RubyPython.configure :python_exe => 'python2.7'
42
+ ```
43
+
44
+ To use a custom pygments installation, specify the path to
45
+ Pygments.start:
46
+
47
+ ``` ruby
48
+ Pygments.start("/path/to/pygments")
49
+ ```
50
+
51
+ ## benchmarks
4
52
 
5
53
  $ ruby -rubygems bench.rb 50
6
54
  user system total real
@@ -9,3 +57,5 @@ a ruby wrapper for the pygments syntax highlighter via embedded python.
9
57
  pygments::ffi + reload 11.350000 1.240000 12.590000 ( 12.692320)
10
58
  pygments::ffi 1.130000 0.010000 1.140000 ( 1.171589)
11
59
 
60
+ To run `bench.rb`, use a git checkout. The C extension is not included
61
+ in gem releases.
data/ext/extconf.rb CHANGED
@@ -9,6 +9,6 @@ $CFLAGS << " -Wall "
9
9
  unless python
10
10
  $stderr.puts '*** could not find libpython or Python.h'
11
11
  else
12
- $CFLAGS << " -I/usr/include/python#{python} "
12
+ $defs << "-DPYGMENTS_PYTHON_VERSION=#{python.gsub('.','')}"
13
13
  create_makefile('pygments_ext')
14
14
  end
data/ext/pygments.c CHANGED
@@ -2,7 +2,18 @@
2
2
  #include <stdlib.h>
3
3
 
4
4
  #include <ruby.h>
5
- #include <Python.h>
5
+
6
+ #if PYGMENTS_PYTHON_VERSION == 24
7
+ #include <python2.4/Python.h>
8
+ #elif PYGMENTS_PYTHON_VERSION == 25
9
+ #include <python2.5/Python.h>
10
+ #elif PYGMENTS_PYTHON_VERSION == 26
11
+ #include <python2.6/Python.h>
12
+ #elif PYGMENTS_PYTHON_VERSION == 27
13
+ #include <python2.7/Python.h>
14
+ #else
15
+ #error Unknown python version
16
+ #endif
6
17
 
7
18
  #ifdef RUBY_VM
8
19
  #include <ruby/st.h>
data/lib/pygments/ffi.rb CHANGED
@@ -6,6 +6,7 @@ module Pygments
6
6
 
7
7
  def start(pygments_path = File.expand_path('../../../vendor/pygments-main/', __FILE__))
8
8
  RubyPython.start
9
+ RubyPython.import('pkg_resources') rescue nil
9
10
  sys = RubyPython.import('sys')
10
11
  sys.path.insert(0, pygments_path)
11
12
 
@@ -65,7 +65,7 @@ module Pygments
65
65
  #
66
66
  # Returns the Lexer or nil if none was found.
67
67
  def self.find(name)
68
- @index[name.downcase]
68
+ @index[name.to_s.downcase]
69
69
  end
70
70
 
71
71
  # Public: Alias for find.
@@ -1,3 +1,3 @@
1
1
  module Pygments
2
- VERSION = '0.2.1'
2
+ VERSION = '0.2.2'
3
3
  end
@@ -39,6 +39,7 @@ Other contributors, listed alphabetically, are:
39
39
  * Matthew Harrison -- SVG formatter
40
40
  * Steven Hazel -- Tcl lexer
41
41
  * Aslak Hellesøy -- Gherkin lexer
42
+ * Jordi Gutiérrez Hermoso -- Octave lexer
42
43
  * David Hess, Fish Software, Inc. -- Objective-J lexer
43
44
  * Varun Hiremath -- Debian control lexer
44
45
  * Ben Hollis -- Mason lexer
@@ -78,6 +79,7 @@ Other contributors, listed alphabetically, are:
78
79
  * Ken Schutte -- Matlab lexers
79
80
  * Tassilo Schweyer -- Io, MOOCode lexers
80
81
  * Joerg Sieker -- ABAP lexer
82
+ * Robert Simmons -- Standard ML lexer
81
83
  * Kirill Simonov -- YAML lexer
82
84
  * Steve Spigarelli -- XQuery lexer
83
85
  * Jerome St-Louis -- eC lexer
@@ -90,6 +92,7 @@ Other contributors, listed alphabetically, are:
90
92
  * Dietmar Winkler -- Modelica lexer
91
93
  * Nils Winter -- Smalltalk lexer
92
94
  * Davy Wybiral -- Clojure lexer
95
+ * Diego Zamboni -- CFengine3 lexer
93
96
  * Alex Zimin -- Nemerle lexer
94
97
 
95
98
  Many thanks for all contributions!
@@ -21,6 +21,9 @@ Version 1.5
21
21
  * PostgreSQL (#660)
22
22
  * DTD
23
23
  * Gosu
24
+ * Octave (PR#22)
25
+ * Standard ML (PR#14)
26
+ * CFengine3 (#601)
24
27
 
25
28
  - In the LaTeX formatter, escape special &, < and > chars (#648).
26
29
 
@@ -41,6 +44,8 @@ Version 1.5
41
44
 
42
45
  - Fix generic type highlighting in ActionScript 3 (#666).
43
46
 
47
+ - Fixes to the Clojure lexer (PR#9).
48
+
44
49
 
45
50
  Version 1.4
46
51
  -----------
@@ -1 +1 @@
1
- 456992e7ff81
1
+ db34feabe4b8
@@ -121,7 +121,7 @@ sections, comments and key = value pairs:
121
121
  }
122
122
 
123
123
  The lexer first looks for whitespace, comments and section names. And later it
124
- looks for a line that looks like a key, value pair, seperated by an ``'='``
124
+ looks for a line that looks like a key, value pair, separated by an ``'='``
125
125
  sign, and optional whitespace.
126
126
 
127
127
  The `bygroups` helper makes sure that each group is yielded with a different
@@ -85,7 +85,7 @@ Here a small overview of all allowed styles:
85
85
  ``bold``
86
86
  render text as bold
87
87
  ``nobold``
88
- don't render text as bold (to prevent subtokens behing highlighted bold)
88
+ don't render text as bold (to prevent subtokens being highlighted bold)
89
89
  ``italic``
90
90
  render text italic
91
91
  ``noitalic``
@@ -219,7 +219,7 @@ def main(args=sys.argv):
219
219
  return 0
220
220
 
221
221
  if opts.pop('-V', None) is not None:
222
- print 'Pygments version %s, (c) 2006-2008 by Georg Brandl.' % __version__
222
+ print 'Pygments version %s, (c) 2006-2011 by Georg Brandl.' % __version__
223
223
  return 0
224
224
 
225
225
  # handle ``pygmentize -L``
@@ -286,7 +286,7 @@ class LatexFormatter(Formatter):
286
286
  cp = self.commandprefix
287
287
  styles = []
288
288
  for name, definition in self.cmd2def.iteritems():
289
- styles.append(r'\def\%s@tok@%s{%s}' % (cp, name, definition))
289
+ styles.append(r'\expandafter\def\csname %s@tok@%s\endcsname{%s}' % (cp, name, definition))
290
290
  return STYLE_TEMPLATE % {'cp': self.commandprefix,
291
291
  'styles': '\n'.join(styles)}
292
292
 
@@ -46,6 +46,7 @@ LEXERS = {
46
46
  'CObjdumpLexer': ('pygments.lexers.asm', 'c-objdump', ('c-objdump',), ('*.c-objdump',), ('text/x-c-objdump',)),
47
47
  'CSharpAspxLexer': ('pygments.lexers.dotnet', 'aspx-cs', ('aspx-cs',), ('*.aspx', '*.asax', '*.ascx', '*.ashx', '*.asmx', '*.axd'), ()),
48
48
  'CSharpLexer': ('pygments.lexers.dotnet', 'C#', ('csharp', 'c#'), ('*.cs',), ('text/x-csharp',)),
49
+ 'Cfengine3Lexer': ('pygments.lexers.other', 'CFEngine3', ('cfengine3', 'cf3'), ('*.cf',), ()),
49
50
  'CheetahHtmlLexer': ('pygments.lexers.templates', 'HTML+Cheetah', ('html+cheetah', 'html+spitfire'), (), ('text/html+cheetah', 'text/html+spitfire')),
50
51
  'CheetahJavascriptLexer': ('pygments.lexers.templates', 'JavaScript+Cheetah', ('js+cheetah', 'javascript+cheetah', 'js+spitfire', 'javascript+spitfire'), (), ('application/x-javascript+cheetah', 'text/x-javascript+cheetah', 'text/javascript+cheetah', 'application/x-javascript+spitfire', 'text/x-javascript+spitfire', 'text/javascript+spitfire')),
51
52
  'CheetahLexer': ('pygments.lexers.templates', 'Cheetah', ('cheetah', 'spitfire'), ('*.tmpl', '*.spt'), ('application/x-cheetah', 'application/x-spitfire')),
@@ -135,7 +136,7 @@ LEXERS = {
135
136
  'MakoXmlLexer': ('pygments.lexers.templates', 'XML+Mako', ('xml+mako',), (), ('application/xml+mako',)),
136
137
  'MaqlLexer': ('pygments.lexers.other', 'MAQL', ('maql',), ('*.maql',), ('text/x-gooddata-maql', 'application/x-gooddata-maql')),
137
138
  'MasonLexer': ('pygments.lexers.templates', 'Mason', ('mason',), ('*.m', '*.mhtml', '*.mc', '*.mi', 'autohandler', 'dhandler'), ('application/x-mason',)),
138
- 'MatlabLexer': ('pygments.lexers.math', 'Matlab', ('matlab', 'octave'), ('*.m',), ('text/matlab',)),
139
+ 'MatlabLexer': ('pygments.lexers.math', 'Matlab', ('matlab',), ('*.m',), ('text/matlab',)),
139
140
  'MatlabSessionLexer': ('pygments.lexers.math', 'Matlab session', ('matlabsession',), (), ()),
140
141
  'MiniDLexer': ('pygments.lexers.agile', 'MiniD', ('minid',), ('*.md',), ('text/x-minidsrc',)),
141
142
  'ModelicaLexer': ('pygments.lexers.other', 'Modelica', ('modelica',), ('*.mo',), ('text/x-modelica',)),
@@ -160,6 +161,7 @@ LEXERS = {
160
161
  'ObjectiveJLexer': ('pygments.lexers.web', 'Objective-J', ('objective-j', 'objectivej', 'obj-j', 'objj'), ('*.j',), ('text/x-objective-j',)),
161
162
  'OcamlLexer': ('pygments.lexers.compiled', 'OCaml', ('ocaml',), ('*.ml', '*.mli', '*.mll', '*.mly'), ('text/x-ocaml',)),
162
163
  'OcamlLexer': ('pygments.lexers.functional', 'OCaml', ('ocaml',), ('*.ml', '*.mli', '*.mll', '*.mly'), ('text/x-ocaml',)),
164
+ 'OctaveLexer': ('pygments.lexers.math', 'Octave', ('octave',), ('*.m',), ('text/octave',)),
163
165
  'OocLexer': ('pygments.lexers.compiled', 'Ooc', ('ooc',), ('*.ooc',), ('text/x-ooc',)),
164
166
  'PerlLexer': ('pygments.lexers.agile', 'Perl', ('perl', 'pl'), ('*.pl', '*.pm'), ('text/x-perl', 'application/x-perl')),
165
167
  'PhpLexer': ('pygments.lexers.web', 'PHP', ('php', 'php3', 'php4', 'php5'), ('*.php', '*.php[345]'), ('text/x-php',)),
@@ -194,6 +196,7 @@ LEXERS = {
194
196
  'RubyConsoleLexer': ('pygments.lexers.agile', 'Ruby irb session', ('rbcon', 'irb'), (), ('text/x-ruby-shellsession',)),
195
197
  'RubyLexer': ('pygments.lexers.agile', 'Ruby', ('rb', 'ruby', 'duby'), ('*.rb', '*.rbw', 'Rakefile', '*.rake', '*.gemspec', '*.rbx', '*.duby'), ('text/x-ruby', 'application/x-ruby')),
196
198
  'SLexer': ('pygments.lexers.math', 'S', ('splus', 's', 'r'), ('*.S', '*.R'), ('text/S-plus', 'text/S', 'text/R')),
199
+ 'SMLLexer': ('pygments.lexers.functional', 'Standard ML', ('sml',), ('*.sml', '*.sig', '*.fun'), ('text/x-standardml', 'application/x-standardml')),
197
200
  'SassLexer': ('pygments.lexers.web', 'Sass', ('sass', 'SASS'), ('*.sass',), ('text/x-sass',)),
198
201
  'ScalaLexer': ('pygments.lexers.compiled', 'Scala', ('scala',), ('*.scala',), ('text/x-scala',)),
199
202
  'ScamlLexer': ('pygments.lexers.web', 'Scaml', ('scaml', 'SCAML'), ('*.scaml',), ('text/x-scaml',)),
@@ -13,7 +13,7 @@ import re
13
13
 
14
14
  from pygments.lexer import Lexer, RegexLexer, ExtendedRegexLexer, \
15
15
  LexerContext, include, combined, do_insertions, bygroups, using, this
16
- from pygments.token import Error, Text, Other, \
16
+ from pygments.token import Error, Text, Whitespace, Other, \
17
17
  Comment, Operator, Keyword, Name, String, Number, Generic, Punctuation
18
18
  from pygments.util import get_bool_opt, get_list_opt, shebang_matches
19
19
  from pygments import unistring as uni
@@ -1367,13 +1367,11 @@ class ClojureLexer(RegexLexer):
1367
1367
 
1368
1368
  keywords = [
1369
1369
  'fn', 'def', 'defn', 'defmacro', 'defmethod', 'defmulti', 'defn-',
1370
- 'defstruct',
1371
- 'if', 'cond',
1372
- 'let', 'for'
1370
+ 'defstruct', 'if', 'cond', 'let', 'for'
1373
1371
  ]
1374
1372
  builtins = [
1375
1373
  '.', '..',
1376
- '*', '+', '-', '->', '..', '/', '<', '<=', '=', '==', '>', '>=',
1374
+ '*', '+', '-', '->', '/', '<', '<=', '=', '==', '>', '>=',
1377
1375
  'accessor', 'agent', 'agent-errors', 'aget', 'alength', 'all-ns',
1378
1376
  'alter', 'and', 'append-child', 'apply', 'array-map', 'aset',
1379
1377
  'aset-boolean', 'aset-byte', 'aset-char', 'aset-double', 'aset-float',
@@ -1389,13 +1387,13 @@ class ClojureLexer(RegexLexer):
1389
1387
  'double', 'down', 'drop', 'drop-while', 'edit', 'end?', 'ensure',
1390
1388
  'eval', 'every?', 'false?', 'ffirst', 'file-seq', 'filter', 'find',
1391
1389
  'find-doc', 'find-ns', 'find-var', 'first', 'float', 'flush',
1392
- 'fnseq', 'frest', 'gensym', 'get', 'get-proxy-class',
1390
+ 'fnseq', 'frest', 'gensym', 'get-proxy-class', 'get',
1393
1391
  'hash-map', 'hash-set', 'identical?', 'identity', 'if-let', 'import',
1394
1392
  'in-ns', 'inc', 'index', 'insert-child', 'insert-left', 'insert-right',
1395
1393
  'inspect-table', 'inspect-tree', 'instance?', 'int', 'interleave',
1396
1394
  'intersection', 'into', 'into-array', 'iterate', 'join', 'key', 'keys',
1397
1395
  'keyword', 'keyword?', 'last', 'lazy-cat', 'lazy-cons', 'left',
1398
- 'lefts', 'line-seq', 'list', 'list*', 'load', 'load-file',
1396
+ 'lefts', 'line-seq', 'list*', 'list', 'load', 'load-file',
1399
1397
  'locking', 'long', 'loop', 'macroexpand', 'macroexpand-1',
1400
1398
  'make-array', 'make-node', 'map', 'map-invert', 'map?', 'mapcat',
1401
1399
  'max', 'max-key', 'memfn', 'merge', 'merge-with', 'meta', 'min',
@@ -1426,7 +1424,14 @@ class ClojureLexer(RegexLexer):
1426
1424
  # valid names for identifiers
1427
1425
  # well, names can only not consist fully of numbers
1428
1426
  # but this should be good enough for now
1429
- valid_name = r'[a-zA-Z0-9!$%&*+,/:<=>?@^_~-]+'
1427
+
1428
+ # TODO / should divide keywords/symbols into namespace/rest
1429
+ # but that's hard, so just pretend / is part of the name
1430
+ valid_name = r'[\w!$%*+,<=>?/.-]+'
1431
+
1432
+ def _multi_escape(entries):
1433
+ return '|'.join([re.escape(entry) + '(?![\\w-!$%*+,<=>?/.-])'
1434
+ for entry in entries])
1430
1435
 
1431
1436
  tokens = {
1432
1437
  'root' : [
@@ -1435,42 +1440,29 @@ class ClojureLexer(RegexLexer):
1435
1440
  (r';.*$', Comment.Single),
1436
1441
 
1437
1442
  # whitespaces - usually not relevant
1438
- (r'\s+', Text),
1443
+ (r'[,\s]+', Whitespace),
1439
1444
 
1440
1445
  # numbers
1441
1446
  (r'-?\d+\.\d+', Number.Float),
1442
1447
  (r'-?\d+', Number.Integer),
1443
- # support for uncommon kinds of numbers -
1444
- # have to figure out what the characters mean
1445
- #(r'(#e|#i|#b|#o|#d|#x)[\d.]+', Number),
1448
+ (r'0x-?[abcdef\d]+', Number.Hex),
1446
1449
 
1447
1450
  # strings, symbols and characters
1448
1451
  (r'"(\\\\|\\"|[^"])*"', String),
1449
1452
  (r"'" + valid_name, String.Symbol),
1450
- (r"\\([()/'\".'_!§$%& ?;=#+-]{1}|[a-zA-Z0-9]+)", String.Char),
1453
+ (r"\\(.|[a-z]+)", String.Char),
1451
1454
 
1452
- # constants
1453
- (r'(#t|#f)', Name.Constant),
1455
+ # keywords
1456
+ (r':' + valid_name, Name.Constant),
1454
1457
 
1455
1458
  # special operators
1456
- (r"('|#|`|,@|,|\.)", Operator),
1459
+ (r'~@|[`\'#^~&]', Operator),
1457
1460
 
1458
1461
  # highlight the keywords
1459
- ('(%s)' % '|'.join([
1460
- re.escape(entry) + ' ' for entry in keywords]),
1461
- Keyword
1462
- ),
1463
-
1464
- # first variable in a quoted string like
1465
- # '(this is syntactic sugar)
1466
- (r"(?<='\()" + valid_name, Name.Variable),
1467
- (r"(?<=#\()" + valid_name, Name.Variable),
1462
+ (_multi_escape(keywords), Keyword),
1468
1463
 
1469
1464
  # highlight the builtins
1470
- ("(?<=\()(%s)" % '|'.join([
1471
- re.escape(entry) + ' ' for entry in builtins]),
1472
- Name.Builtin
1473
- ),
1465
+ (_multi_escape(builtins), Name.Builtin),
1474
1466
 
1475
1467
  # the remaining functions
1476
1468
  (r'(?<=\()' + valid_name, Name.Function),
@@ -13,11 +13,12 @@ import re
13
13
 
14
14
  from pygments.lexer import Lexer, RegexLexer, bygroups, include, do_insertions
15
15
  from pygments.token import Text, Comment, Operator, Keyword, Name, \
16
- String, Number, Punctuation, Literal, Generic
16
+ String, Number, Punctuation, Literal, Generic, Error
17
17
 
18
18
 
19
- __all__ = ['SchemeLexer', 'CommonLispLexer', 'HaskellLexer', 'LiterateHaskellLexer',
20
- 'OcamlLexer', 'ErlangLexer', 'ErlangShellLexer']
19
+ __all__ = ['SchemeLexer', 'CommonLispLexer', 'HaskellLexer',
20
+ 'LiterateHaskellLexer', 'SMLLexer', 'OcamlLexer', 'ErlangLexer',
21
+ 'ErlangShellLexer']
21
22
 
22
23
 
23
24
  class SchemeLexer(RegexLexer):
@@ -515,6 +516,329 @@ class LiterateHaskellLexer(Lexer):
515
516
  yield item
516
517
 
517
518
 
519
+ class SMLLexer(RegexLexer):
520
+ """
521
+ For the Standard ML language.
522
+
523
+ *New in Pygments 1.5.*
524
+ """
525
+
526
+ name = 'Standard ML'
527
+ aliases = ['sml']
528
+ filenames = ['*.sml', '*.sig', '*.fun',]
529
+ mimetypes = ['text/x-standardml', 'application/x-standardml']
530
+
531
+ alphanumid_reserved = [
532
+ # Core
533
+ 'abstype', 'and', 'andalso', 'as', 'case', 'datatype', 'do', 'else',
534
+ 'end', 'exception', 'fn', 'fun', 'handle', 'if', 'in', 'infix',
535
+ 'infixr', 'let', 'local', 'nonfix', 'of', 'op', 'open', 'orelse',
536
+ 'raise', 'rec', 'then', 'type', 'val', 'with', 'withtype', 'while',
537
+ # Modules
538
+ 'eqtype', 'functor', 'include', 'sharing', 'sig', 'signature',
539
+ 'struct', 'structure', 'where',
540
+ ]
541
+
542
+ symbolicid_reserved = [
543
+ # Core
544
+ ':', '\|', '=', '=>', '->', '#',
545
+ # Modules
546
+ ':>',
547
+ ]
548
+
549
+ nonid_reserved = [ '(', ')', '[', ']', '{', '}', ',', ';', '...', '_' ]
550
+
551
+ alphanumid_re = r"[a-zA-Z][a-zA-Z0-9_']*"
552
+ symbolicid_re = r"[!%&$#+\-/:<=>?@\\~`^|*]+"
553
+
554
+ # A character constant is a sequence of the form #s, where s is a string
555
+ # constant denoting a string of size one character. This setup just parses
556
+ # the entire string as either a String.Double or a String.Char (depending
557
+ # on the argument), even if the String.Char is an erroneous
558
+ # multiple-character string.
559
+ def stringy (whatkind):
560
+ return [
561
+ (r'[^"\\]', whatkind),
562
+ (r'\\[\\\"abtnvfr]', String.Escape),
563
+ (r'\\\^[@-^]', String.Escape),
564
+ (r'\\[0-9]{3}', String.Escape),
565
+ (r'\\u[0-9a-fA-F]{4}', String.Escape),
566
+ (r'\\\s+\\', String.Interpol),
567
+ (r'"', whatkind, '#pop'),
568
+ ]
569
+
570
+ # Callbacks for distinguishing tokens and reserved words
571
+ def long_id_callback(self, match):
572
+ if match.group(1) in self.alphanumid_reserved: token = Error
573
+ else: token = Name.Namespace
574
+ yield match.start(1), token, match.group(1)
575
+ yield match.start(2), Punctuation, match.group(2)
576
+
577
+ def end_id_callback(self, match):
578
+ if match.group(1) in self.alphanumid_reserved: token = Error
579
+ elif match.group(1) in self.symbolicid_reserved: token = Error
580
+ else: token = Name
581
+ yield match.start(1), token, match.group(1)
582
+
583
+ def id_callback(self, match):
584
+ str = match.group(1)
585
+ if str in self.alphanumid_reserved: token = Keyword.Reserved
586
+ elif str in self.symbolicid_reserved: token = Punctuation
587
+ else: token = Name
588
+ yield match.start(1), token, str
589
+
590
+ tokens = {
591
+ # Whitespace and comments are (almost) everywhere
592
+ 'whitespace': [
593
+ (r'\s+', Text),
594
+ (r'\(\*', Comment.Multiline, 'comment'),
595
+ ],
596
+
597
+ 'delimiters': [
598
+ # This lexer treats these delimiters specially:
599
+ # Delimiters define scopes, and the scope is how the meaning of
600
+ # the `|' is resolved - is it a case/handle expression, or function
601
+ # definition by cases? (This is not how the Definition works, but
602
+ # it's how MLton behaves, see http://mlton.org/SMLNJDeviations)
603
+ (r'\(|\[|{', Punctuation, 'main'),
604
+ (r'\)|\]|}', Punctuation, '#pop'),
605
+ (r'\b(let|if|local)\b(?!\')', Keyword.Reserved, ('main', 'main')),
606
+ (r'\b(struct|sig|while)\b(?!\')', Keyword.Reserved, 'main'),
607
+ (r'\b(do|else|end|in|then)\b(?!\')', Keyword.Reserved, '#pop'),
608
+ ],
609
+
610
+ 'core': [
611
+ # Punctuation that doesn't overlap symbolic identifiers
612
+ (r'(%s)' % '|'.join([re.escape(z) for z in nonid_reserved]),
613
+ Punctuation),
614
+
615
+ # Special constants: strings, floats, numbers in decimal and hex
616
+ (r'#"', String.Char, 'char'),
617
+ (r'"', String.Double, 'string'),
618
+ (r'~?0x[0-9a-fA-F]+', Number.Hex),
619
+ (r'0wx[0-9a-fA-F]+', Number.Hex),
620
+ (r'0w\d+', Number.Integer),
621
+ (r'~?\d+\.\d+[eE]~?\d+', Number.Float),
622
+ (r'~?\d+\.\d+', Number.Float),
623
+ (r'~?\d+[eE]~?\d+', Number.Float),
624
+ (r'~?\d+', Number.Integer),
625
+
626
+ # Labels
627
+ (r'#\s*[1-9][0-9]*', Name.Label),
628
+ (r'#\s*(%s)' % alphanumid_re, Name.Label),
629
+ (r'#\s+(%s)' % symbolicid_re, Name.Label),
630
+ # Some reserved words trigger a special, local lexer state change
631
+ (r'\b(datatype|abstype)\b(?!\')', Keyword.Reserved, 'dname'),
632
+ (r'(?=\b(exception)\b(?!\'))', Text, ('ename')),
633
+ (r'\b(functor|include|open|signature|structure)\b(?!\')',
634
+ Keyword.Reserved, 'sname'),
635
+ (r'\b(type|eqtype)\b(?!\')', Keyword.Reserved, 'tname'),
636
+
637
+ # Regular identifiers, long and otherwise
638
+ (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
639
+ (r'(%s)(\.)' % alphanumid_re, long_id_callback, "dotted"),
640
+ (r'(%s)' % alphanumid_re, id_callback),
641
+ (r'(%s)' % symbolicid_re, id_callback),
642
+ ],
643
+ 'dotted': [
644
+ (r'(%s)(\.)' % alphanumid_re, long_id_callback),
645
+ (r'(%s)' % alphanumid_re, end_id_callback, "#pop"),
646
+ (r'(%s)' % symbolicid_re, end_id_callback, "#pop"),
647
+ (r'\s+', Error),
648
+ (r'\S+', Error),
649
+ ],
650
+
651
+
652
+ # Main parser (prevents errors in files that have scoping errors)
653
+ 'root': [ (r'', Text, 'main') ],
654
+
655
+ # In this scope, I expect '|' to not be followed by a function name,
656
+ # and I expect 'and' to be followed by a binding site
657
+ 'main': [
658
+ include('whitespace'),
659
+
660
+ # Special behavior of val/and/fun
661
+ (r'\b(val|and)\b(?!\')', Keyword.Reserved, 'vname'),
662
+ (r'\b(fun)\b(?!\')', Keyword.Reserved,
663
+ ('#pop', 'main-fun', 'fname')),
664
+
665
+ include('delimiters'),
666
+ include('core'),
667
+ (r'\S+', Error),
668
+ ],
669
+
670
+ # In this scope, I expect '|' and 'and' to be followed by a function
671
+ 'main-fun': [
672
+ include('whitespace'),
673
+
674
+ (r'\s', Text),
675
+ (r'\(\*', Comment.Multiline, 'comment'),
676
+
677
+ # Special behavior of val/and/fun
678
+ (r'\b(fun|and)\b(?!\')', Keyword.Reserved, 'fname'),
679
+ (r'\b(val)\b(?!\')', Keyword.Reserved,
680
+ ('#pop', 'main', 'vname')),
681
+
682
+ # Special behavior of '|' and '|'-manipulating keywords
683
+ (r'\|', Punctuation, 'fname'),
684
+ (r'\b(case|handle)\b(?!\')', Keyword.Reserved,
685
+ ('#pop', 'main')),
686
+
687
+ include('delimiters'),
688
+ include('core'),
689
+ (r'\S+', Error),
690
+ ],
691
+
692
+ # Character and string parsers
693
+ 'char': stringy(String.Char),
694
+ 'string': stringy(String.Double),
695
+
696
+ 'breakout': [
697
+ (r'(?=\b(%s)\b(?!\'))' % '|'.join(alphanumid_reserved), Text, '#pop'),
698
+ ],
699
+
700
+ # Dealing with what comes after module system keywords
701
+ 'sname': [
702
+ include('whitespace'),
703
+ include('breakout'),
704
+
705
+ (r'(%s)' % alphanumid_re, Name.Namespace),
706
+ (r'', Text, '#pop'),
707
+ ],
708
+
709
+ # Dealing with what comes after the 'fun' (or 'and' or '|') keyword
710
+ 'fname': [
711
+ include('whitespace'),
712
+ (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
713
+ (r'\(', Punctuation, 'tyvarseq'),
714
+
715
+ (r'(%s)' % alphanumid_re, Name.Function, '#pop'),
716
+ (r'(%s)' % symbolicid_re, Name.Function, '#pop'),
717
+
718
+ # Ignore interesting function declarations like "fun (x + y) = ..."
719
+ (r'', Text, '#pop'),
720
+ ],
721
+
722
+ # Dealing with what comes after the 'val' (or 'and') keyword
723
+ 'vname': [
724
+ include('whitespace'),
725
+ (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
726
+ (r'\(', Punctuation, 'tyvarseq'),
727
+
728
+ (r'(%s)(\s*)(=(?!%s))' % (alphanumid_re, symbolicid_re),
729
+ bygroups(Name.Variable, Text, Punctuation), '#pop'),
730
+ (r'(%s)(\s*)(=(?!%s))' % (symbolicid_re, symbolicid_re),
731
+ bygroups(Name.Variable, Text, Punctuation), '#pop'),
732
+ (r'(%s)' % alphanumid_re, Name.Variable, '#pop'),
733
+ (r'(%s)' % symbolicid_re, Name.Variable, '#pop'),
734
+
735
+ # Ignore interesting patterns like 'val (x, y)'
736
+ (r'', Text, '#pop'),
737
+ ],
738
+
739
+ # Dealing with what comes after the 'type' (or 'and') keyword
740
+ 'tname': [
741
+ include('whitespace'),
742
+ include('breakout'),
743
+
744
+ (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
745
+ (r'\(', Punctuation, 'tyvarseq'),
746
+ (r'=(?!%s)' % symbolicid_re, Punctuation, ('#pop', 'typbind')),
747
+
748
+ (r'(%s)' % alphanumid_re, Keyword.Type),
749
+ (r'(%s)' % symbolicid_re, Keyword.Type),
750
+ (r'\S+', Error, '#pop'),
751
+ ],
752
+
753
+ # A type binding includes most identifiers
754
+ 'typbind': [
755
+ include('whitespace'),
756
+
757
+ (r'\b(and)\b(?!\')', Keyword.Reserved, ('#pop', 'tname')),
758
+
759
+ include('breakout'),
760
+ include('core'),
761
+ (r'\S+', Error, '#pop'),
762
+ ],
763
+
764
+ # Dealing with what comes after the 'datatype' (or 'and') keyword
765
+ 'dname': [
766
+ include('whitespace'),
767
+ include('breakout'),
768
+
769
+ (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
770
+ (r'\(', Punctuation, 'tyvarseq'),
771
+ (r'(=)(\s*)(datatype)',
772
+ bygroups(Punctuation, Text, Keyword.Reserved), '#pop'),
773
+ (r'=(?!%s)' % symbolicid_re, Punctuation,
774
+ ('#pop', 'datbind', 'datcon')),
775
+
776
+ (r'(%s)' % alphanumid_re, Keyword.Type),
777
+ (r'(%s)' % symbolicid_re, Keyword.Type),
778
+ (r'\S+', Error, '#pop'),
779
+ ],
780
+
781
+ # common case - A | B | C of int
782
+ 'datbind': [
783
+ include('whitespace'),
784
+
785
+ (r'\b(and)\b(?!\')', Keyword.Reserved, ('#pop', 'dname')),
786
+ (r'\b(withtype)\b(?!\')', Keyword.Reserved, ('#pop', 'tname')),
787
+ (r'\b(of)\b(?!\')', Keyword.Reserved),
788
+
789
+ (r'(\|)(\s*)(%s)' % alphanumid_re,
790
+ bygroups(Punctuation, Text, Name.Class)),
791
+ (r'(\|)(\s+)(%s)' % symbolicid_re,
792
+ bygroups(Punctuation, Text, Name.Class)),
793
+
794
+ include('breakout'),
795
+ include('core'),
796
+ (r'\S+', Error),
797
+ ],
798
+
799
+ # Dealing with what comes after an exception
800
+ 'ename': [
801
+ include('whitespace'),
802
+
803
+ (r'(exception|and)\b(\s+)(%s)' % alphanumid_re,
804
+ bygroups(Keyword.Reserved, Text, Name.Class)),
805
+ (r'(exception|and)\b(\s*)(%s)' % symbolicid_re,
806
+ bygroups(Keyword.Reserved, Text, Name.Class)),
807
+ (r'\b(of)\b(?!\')', Keyword.Reserved),
808
+
809
+ include('breakout'),
810
+ include('core'),
811
+ (r'\S+', Error),
812
+ ],
813
+
814
+ 'datcon': [
815
+ include('whitespace'),
816
+ (r'(%s)' % alphanumid_re, Name.Class, '#pop'),
817
+ (r'(%s)' % symbolicid_re, Name.Class, '#pop'),
818
+ (r'\S+', Error, '#pop'),
819
+ ],
820
+
821
+ # Series of type variables
822
+ 'tyvarseq': [
823
+ (r'\s', Text),
824
+ (r'\(\*', Comment.Multiline, 'comment'),
825
+
826
+ (r'\'[0-9a-zA-Z_\']*', Name.Decorator),
827
+ (alphanumid_re, Name),
828
+ (r',', Punctuation),
829
+ (r'\)', Punctuation, '#pop'),
830
+ (symbolicid_re, Name),
831
+ ],
832
+
833
+ 'comment': [
834
+ (r'[^(*)]', Comment.Multiline),
835
+ (r'\(\*', Comment.Multiline, '#push'),
836
+ (r'\*\)', Comment.Multiline, '#pop'),
837
+ (r'[(*)]', Comment.Multiline),
838
+ ],
839
+ }
840
+
841
+
518
842
  class OcamlLexer(RegexLexer):
519
843
  """
520
844
  For the OCaml language.