djot 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,641 +0,0 @@
1
- -- this allows the code to work with both lua and luajit:
2
- local unpack = unpack or table.unpack
3
- local match = require("djot.match")
4
- local attributes = require("djot.attributes")
5
- local make_match, unpack_match, matches_pattern =
6
- match.make_match, match.unpack_match, match.matches_pattern
7
- local find, byte = string.find, string.byte
8
-
9
-- Like string.find, but a match only counts when it ends at or before
-- `endpos`.  Returns the start and end positions followed by up to three
-- captures, or nothing at all when there is no match inside the bound.
local function bounded_find(subj, patt, startpos, endpos)
  local first, last, cap1, cap2, cap3 = find(subj, patt, startpos)
  if not last or last > endpos then
    return
  end
  return first, last, cap1, cap2, cap3
end
16
-
17
- local Parser = {}
18
-
19
-- Create a parser for `subject`.  `opts` is an optional options table.
-- The returned table uses `self` (normally Parser) as its prototype.
function Parser:new(subject, opts)
  self.__index = self
  return setmetatable({
    opts = opts or {},        -- options
    subject = subject,
    matches = {},             -- table pos : (endpos, annotation)
    warnings = {},            -- array of {pos, string} arrays
    openers = {},             -- map from closer_type to array of
                              -- (pos, data) in reverse order
    verbatim = 0,             -- parsing verbatim span to be ended by
                              -- n backticks
    verbatim_type = nil,      -- whether verbatim is math or regular
    destination = false,      -- parsing link destination in ()
    firstpos = 0,             -- position of first slice
    lastpos = 0,              -- position of last slice
    allow_attributes = true,  -- allow parsing of attributes
    attribute_parser = nil,   -- attribute parser
    attribute_start = nil,    -- start of potential attribute
    attribute_slices = nil,   -- slices we've tried to parse as attributes
  }, self)
end
40
-
41
-- Record a match covering startpos..endpos with the given annotation.
-- Matches are keyed by start position, so a later match that starts at
-- the same position replaces the earlier one.
function Parser:add_match(startpos, endpos, annotation)
  local matches = self.matches
  matches[startpos] = make_match(startpos, endpos, annotation)
end
44
-
45
-- Push a potential opener onto the stack kept for `name`.
-- The variadic arguments are stored as one array:
-- 1 = startpos, 2 = endpos, 3 = annotation, 4 = substartpos, 5 = endpos
function Parser:add_opener(name, ...)
  local stack = self.openers[name]
  if not stack then
    stack = {}
    self.openers[name] = stack
  end
  table.insert(stack, {...})
end
52
-
53
-- Discard openers that lie between startpos and endpos.
-- Each opener stack is walked from its top.  An opener entirely inside
-- the range is removed; an opener whose sub-span (fields 4 and 5) lies
-- inside the range keeps its position but loses fields 3-5.  The walk of
-- a stack stops at the first opener that matches neither case.
function Parser:clear_openers(startpos, endpos)
  for _, stack in pairs(self.openers) do
    local idx = #stack
    while stack[idx] do
      local entry = stack[idx]
      local sp, ep, _, sub_sp, sub_ep = unpack(entry)
      if sp >= startpos and ep <= endpos then
        stack[idx] = nil
      elseif sub_sp and sub_ep
        and sub_sp >= startpos and sub_ep <= endpos then
        entry[3], entry[4], entry[5] = nil, nil, nil
      else
        break
      end
      idx = idx - 1
    end
  end
end
72
-
73
-- Re-annotate every match that starts in startpos..endpos as "str",
-- leaving matches already annotated "str" or "escape" untouched.
function Parser:str_matches(startpos, endpos)
  local matches = self.matches
  for i = startpos, endpos do
    local m = matches[i]
    if m then
      local sp, ep, annot = unpack_match(m)
      if not (annot == "str" or annot == "escape") then
        matches[i] = make_match(sp, ep, "str")
      end
    end
  end
end
84
-
85
-- Build a matcher for a delimiter character `c` that encloses inline
-- content (emphasis, strong, quotes, ...).
--   c            delimiter, also the key of its opener stack
--   annotation   annotation name; matches get "+name" and "-name"
--   defaultmatch annotation used when the delimiter does not pair up
--                (defaults to "str")
--   opentest     optional function(self, pos); the delimiter may only
--                open when it returns a true value
-- The returned matcher takes (self, pos) and returns the position at
-- which parsing should resume.
function Parser.between_matched(c, annotation, defaultmatch, opentest)
  return function(self, pos)
    local fallback = defaultmatch or "str"
    local subject = self.subject
    -- may open if followed by non-space, may close if preceded by one
    local can_open = find(subject, "^%S", pos + 1)
    local can_close = find(subject, "^%S", pos - 1)
    local has_open_marker =
      matches_pattern(self.matches[pos - 1], "^open%_marker")
    local has_close_marker = byte(subject, pos + 1) == 125 -- }
    local endcloser = pos
    local startopener = pos

    if type(opentest) == "function" then
      can_open = can_open and opentest(self, pos)
    end

    -- explicit open/close markers override the whitespace heuristics;
    -- an open marker takes precedence over a close marker
    if has_open_marker then
      can_open, can_close = true, false
      startopener = pos - 1
    elseif has_close_marker then
      can_open, can_close = false, true
      endcloser = pos + 1
    end

    -- flip the direction of a left_/right_ fallback to agree with an
    -- explicit marker
    if has_open_marker and fallback:match("^right") then
      fallback = fallback:gsub("^right", "left")
    elseif has_close_marker and fallback:match("^left") then
      fallback = fallback:gsub("^left", "right")
    end

    local openers = self.openers[c]
    if can_close and openers and #openers > 0 then
      -- pair with the most recent opener
      local openpos, openposend = unpack(openers[#openers])
      if openposend ~= pos - 1 then -- exclude empty emph
        self:clear_openers(openpos, pos)
        self:add_match(openpos, openposend, "+" .. annotation)
        self:add_match(pos, endcloser, "-" .. annotation)
        return endcloser + 1
      end
    end

    -- no opener was matched
    if not can_open then
      self:add_match(pos, endcloser, fallback)
      return endcloser + 1
    end
    self:add_opener(c, startopener, pos)
    self:add_match(startopener, pos, fallback)
    return pos + 1
  end
end
141
-
142
- Parser.matchers = {
143
-- 96 = `
-- Opens a verbatim span.  One or two `$` directly before the backticks
-- turn it into inline or display math; their earlier matches are
-- dropped and folded into the opening match.
[96] = function(self, pos, endpos)
  local subject = self.subject
  local _, endchar = bounded_find(subject, "^`*", pos, endpos)
  if not endchar then
    return nil
  end
  local startpos, kind
  if find(subject, "^%$%$", pos - 2) then
    self.matches[pos - 2] = nil
    self.matches[pos - 1] = nil
    startpos, kind = pos - 2, "display_math"
  elseif find(subject, "^%$", pos - 1) then
    self.matches[pos - 1] = nil
    startpos, kind = pos - 1, "inline_math"
  else
    startpos, kind = pos, "verbatim"
  end
  self:add_match(startpos, endchar, "+" .. kind)
  self.verbatim_type = kind
  -- number of backticks that must close the span
  self.verbatim = endchar - pos + 1
  return endchar + 1
end,
166
-
167
-- 92 = \
-- Backslash: a hard break when followed by optional blanks and a
-- newline, an escape when followed by punctuation or a space (escaped
-- space becomes "nbsp"), otherwise a literal backslash.
[92] = function(self, pos, endpos)
  local subject = self.subject
  local _, endchar = bounded_find(subject, "^[ \t]*\r?\n", pos + 1, endpos)
  if endchar then
    -- Hard break: trim blanks from the str match that ends right before
    -- the backslash.  self.matches is keyed by start position (it is
    -- not a sequence), so the preceding match is located by walking
    -- back from pos - 1 rather than with the length operator.
    local prev
    for i = pos - 1, self.firstpos, -1 do
      if self.matches[i] then
        prev = i
        break
      end
    end
    if prev then
      local sp, ep, annot = unpack_match(self.matches[prev])
      if annot == "str" and ep == pos - 1 then
        while ep >= sp
          and (byte(subject, ep) == 32 or byte(subject, ep) == 9) do
          ep = ep - 1
        end
        if ep < sp then
          -- the match consisted only of blanks
          self.matches[prev] = nil
        elseif ep < pos - 1 then
          self:add_match(sp, ep, "str")
        end
      end
    end
    self:add_match(pos, pos, "escape")
    self:add_match(pos + 1, endchar, "hardbreak")
    return endchar + 1
  end

  local _, ec = bounded_find(subject, "^[%p ]", pos + 1, endpos)
  if not ec then
    -- nothing escapable follows: the backslash is literal
    self:add_match(pos, pos, "str")
    return pos + 1
  end
  self:add_match(pos, pos, "escape")
  if find(subject, "^ ", pos + 1) then
    self:add_match(pos + 1, ec, "nbsp")
  else
    self:add_match(pos + 1, ec, "str")
  end
  return ec + 1
end,
205
-
206
-- 60 = <
-- Autolinks: <...> with no spaces or angle brackets inside.  Content
-- with an "@" not preceded by ":" is an email; otherwise content that
-- starts with letters and ":" is a URL.  Anything else is not handled
-- here (nothing is returned).
[60] = function(self, pos, endpos)
  local subject = self.subject
  local starturl, endurl =
    bounded_find(subject, "^%<[^<>%s]+%>", pos, endpos)
  if not starturl then
    return
  end
  local kind
  if bounded_find(subject, "^[^:]+%@", pos + 1, endurl) then
    kind = "email"
  elseif bounded_find(subject, "^%a+:", pos + 1, endurl) then
    kind = "url"
  else
    return
  end
  self:add_match(starturl, starturl, "+" .. kind)
  self:add_match(starturl + 1, endurl - 1, "str")
  self:add_match(endurl, endurl, "-" .. kind)
  return endurl + 1
end,
227
-
228
-- 126 = ~  (subscript delimiter)
[126] = Parser.between_matched("~", "subscript"),

-- 94 = ^  (superscript delimiter)
[94] = Parser.between_matched("^", "superscript"),
233
-
234
-- 91 = [
-- "[^label]" is a footnote reference; any other "[" is recorded as a
-- potential opener and provisionally emitted as a str.
[91] = function(self, pos, endpos)
  local sp, ep = bounded_find(self.subject, "^%^([^]]+)%]", pos + 1, endpos)
  if not sp then
    self:add_opener("[", pos, pos)
    self:add_match(pos, pos, "str")
    return pos + 1
  end
  self:add_match(pos, ep, "footnote_reference")
  return ep + 1
end,
246
-
247
-- 93 = ]
-- Closes, or continues, the most recent "[" opener:
--   * opener already marked "reference_link": this "]" ends the
--     reference, so emit the link/image text and reference matches;
--   * followed by "[": start of a reference link label;
--   * followed by "(": start of an explicit link destination;
--   * followed by "{": bracketed span (attributes are assumed to follow).
-- Returns nothing when there is no opener or none of the cases apply.
[93] = function(self, pos, endpos)
  local openers = self.openers["["]
  if not (openers and #openers > 0) then
    return
  end
  local subject = self.subject
  local opener = openers[#openers]
  if opener[3] == "reference_link" then
    -- found a reference link; an unescaped "!" before "[" makes it an image
    local is_image = bounded_find(subject, "^!", opener[1] - 1, endpos)
      and not bounded_find(subject, "^[\\]", opener[1] - 2, endpos)
    -- The closing text match covers only the intermediate "]"
    -- (opener[4]); the following "[" (opener[5]) belongs to the
    -- "+reference" match, mirroring how explicit links are emitted.
    if is_image then
      self:add_match(opener[1] - 1, opener[1] - 1, "image_marker")
      self:add_match(opener[1], opener[2], "+imagetext")
      self:add_match(opener[4], opener[4], "-imagetext")
    else
      self:add_match(opener[1], opener[2], "+linktext")
      self:add_match(opener[4], opener[4], "-linktext")
    end
    self:add_match(opener[5], opener[5], "+reference")
    self:add_match(pos, pos, "-reference")
    -- convert all matches inside the label to str
    self:str_matches(opener[5] + 1, pos - 1)
    -- remove from openers
    self:clear_openers(opener[1], pos)
    return pos + 1
  elseif bounded_find(subject, "^%[", pos + 1, endpos) then
    opener[3] = "reference_link"
    opener[4] = pos      -- intermediate ]
    opener[5] = pos + 1  -- intermediate [
    self:add_match(pos, pos + 1, "str")
    return pos + 2
  elseif bounded_find(subject, "^%(", pos + 1, endpos) then
    self.openers["("] = {} -- clear ( openers
    opener[3] = "explicit_link"
    opener[4] = pos      -- intermediate ]
    opener[5] = pos + 1  -- intermediate (
    self.destination = true
    self:add_match(pos, pos + 1, "str")
    return pos + 2
  elseif bounded_find(subject, "^%{", pos + 1, endpos) then
    -- assume this is attributes, bracketed span
    self:add_match(opener[1], opener[2], "+span")
    self:add_match(pos, pos, "-span")
    self:clear_openers(opener[1], pos)
    return pos + 1
  end
end,
297
-
298
-
299
-- 40 = (
-- Only meaningful while a link destination is being parsed: tracks
-- nested parentheses so the matching ")" does not end the destination.
[40] = function(self, pos)
  if self.destination then
    self:add_opener("(", pos, pos)
    self:add_match(pos, pos, "str")
    return pos + 1
  end
  return nil
end,
306
-
307
-- 41 = )
-- While parsing a link destination: either balances a nested "(" or,
-- when no nested paren is open, ends the destination of the explicit
-- link recorded on the most recent "[" opener.
[41] = function(self, pos, endpos)
  if not self.destination then
    return nil
  end
  local parens = self.openers["("]
  if parens and #parens > 0 and parens[#parens][1] then
    parens[#parens] = nil -- clear opener
    self:add_match(pos, pos, "str")
    return pos + 1
  end

  local brackets = self.openers["["]
  if not (brackets and #brackets > 0) then
    return
  end
  local opener = brackets[#brackets]
  if opener[3] ~= "explicit_link" then
    return
  end

  -- we have inline link
  local subject = self.subject
  local startdest, enddest = opener[5], pos
  local is_image = bounded_find(subject, "^!", opener[1] - 1, endpos)
    and not bounded_find(subject, "^[\\]", opener[1] - 2, endpos)
  if is_image then
    self:add_match(opener[1] - 1, opener[1] - 1, "image_marker")
    self:add_match(opener[1], opener[2], "+imagetext")
    self:add_match(opener[4], opener[4], "-imagetext")
  else
    self:add_match(opener[1], opener[2], "+linktext")
    self:add_match(opener[4], opener[4], "-linktext")
  end
  self:add_match(startdest, startdest, "+destination")
  self:add_match(enddest, enddest, "-destination")
  self.destination = false
  -- everything inside the destination becomes plain str
  self:str_matches(opener[5] + 1, pos - 1)
  -- remove from openers
  self:clear_openers(opener[2], pos)
  return enddest + 1
end,
344
-
345
-- 95 = _  (emphasis delimiter)
[95] = Parser.between_matched("_", "emph"),

-- 42 = *  (strong delimiter)
[42] = Parser.between_matched("*", "strong"),
350
-
351
-- 123 = {
-- "{" directly before a delimiter character is an explicit open marker.
-- Otherwise, when attributes are allowed, start an attribute parser at
-- this position (the position is returned unchanged so that feed hands
-- the "{" to that parser).  Failing both, "{" is a plain str.
[123] = function(self, pos, endpos)
  if bounded_find(self.subject, "^[_*~^+='\"-]", pos + 1, endpos) then
    self:add_match(pos, pos, "open_marker")
    return pos + 1
  end
  if not self.allow_attributes then
    self:add_match(pos, pos, "str")
    return pos + 1
  end
  self.attribute_parser = attributes.AttributeParser:new(self.subject)
  self.attribute_start = pos
  self.attribute_slices = {}
  return pos
end,
366
-
367
-- 58 = :
-- ":name:" (letters, digits, "_", "+", "-") is an emoji; a lone ":" is
-- a plain str.
[58] = function(self, pos, endpos)
  local sp, ep = bounded_find(self.subject, "^%:[%w_+-]+%:", pos, endpos)
  if not sp then
    self:add_match(pos, pos, "str")
    return pos + 1
  end
  self:add_match(sp, ep, "emoji")
  return ep + 1
end,
378
-
379
-- 43 = +
-- Insert delimiter; may only open when written with braces, i.e. when
-- preceded by "{" or followed by "}".
[43] = Parser.between_matched("+", "insert", "str",
  function(self, pos)
    local subject = self.subject
    return find(subject, "^%{", pos - 1)
      or find(subject, "^%}", pos + 1)
  end),

-- 61 = =
-- Mark delimiter; same brace requirement as insert.
[61] = Parser.between_matched("=", "mark", "str",
  function(self, pos)
    local subject = self.subject
    return find(subject, "^%{", pos - 1)
      or find(subject, "^%}", pos + 1)
  end),
392
-
393
-- 39 = '
-- Single quote.  It may open a quotation only at the very start of the
-- subject or after whitespace, a quote character, a hyphen, "(" or "[".
-- The hyphen is escaped as %- so that it is a literal member of the set;
-- unescaped between "'" and "(" it would denote the range '..( and a
-- preceding hyphen would never be recognised.
[39] = Parser.between_matched("'", "single_quoted", "right_single_quote",
  function(self, pos) -- test to open
    return pos == 1 or
      find(self.subject, "^[%s\"'%-([]", pos - 1)
  end),
399
-
400
-- 34 = "  (double quote; an unpaired one defaults to a left quote)
[34] = Parser.between_matched("\"", "double_quoted", "left_double_quote"),
402
-
403
-- 45 = -
-- Hyphen runs.  A hyphen written with braces ("{-" or "-}") is a delete
-- delimiter.  Otherwise the run is split into em dashes (3 hyphens),
-- en dashes (2) and single hyphens, preferring a homogeneous sequence.
[45] = function(self, pos, endpos)
  local subject = self.subject
  local _, ep = find(subject, "^%-*", pos)
  -- count only the hyphens that fall inside the current slice
  local hyphens = 1 + math.min(ep, endpos) - pos
  if byte(subject, ep + 1) == 125 then -- }
    hyphens = hyphens - 1 -- last hyphen is close del
  end
  if byte(subject, pos - 1) == 123 or byte(subject, pos + 1) == 125 then
    return Parser.between_matched("-", "delete")(self, pos, endpos)
  end
  -- Try to construct a homogeneous sequence of dashes
  local all_em = hyphens % 3 == 0
  local all_en = hyphens % 2 == 0
  while hyphens > 0 do
    local width
    if all_em then
      width = 3
    elseif all_en then
      width = 2
    elseif hyphens >= 3 and (hyphens % 2 ~= 0 or hyphens > 4) then
      width = 3
    elseif hyphens >= 2 then
      width = 2
    else
      width = 1
    end
    local annot = (width == 3 and "em_dash")
      or (width == 2 and "en_dash")
      or "str"
    self:add_match(pos, pos + width - 1, annot)
    pos = pos + width
    hyphens = hyphens - width
  end
  return pos
end,
447
-
448
-- 46 = .
-- Three consecutive periods form an "ellipses" match; anything else is
-- left to the caller (nothing is returned).
[46] = function(self, pos, endpos)
  if not bounded_find(self.subject, "^%.%.", pos + 1, endpos) then
    return
  end
  self:add_match(pos, pos + 2, "ellipses")
  return pos + 3
end
455
- }
456
-
457
-- Fallback matcher: emit the character at `pos` as a one-character str
-- and return the position after it.
function Parser:single_char(pos)
  local next_pos = pos + 1
  self:add_match(pos, pos, "str")
  return next_pos
end
461
-
462
-- Feed a slice to the parser, updating state.
-- `spos` and `endpos` are inclusive positions in self.subject.  The
-- slice is scanned left to right; runs of ordinary characters become
-- "str" matches and each special character is dispatched to its entry
-- in self.matchers (falling back to single_char).  While an attribute
-- parser is active the input is handed to it instead, one chunk at a
-- time, and is re-fed as ordinary text if the attribute parse fails.
function Parser:feed(spos, endpos)
  local special = "[][\\`{}_*()!<>~^:=+$\r\n'\".-]"
  local subject = self.subject
  local matchers = self.matchers
  local pos
  -- track the overall extent of everything fed so far
  if self.firstpos == 0 or spos < self.firstpos then
    self.firstpos = spos
  end
  if self.lastpos == 0 or endpos > self.lastpos then
    self.lastpos = endpos
  end
  pos = spos
  while pos <= endpos do
    if self.attribute_parser then
      local sp = pos
      -- feed the attribute parser up to the next special character
      local ep2 = bounded_find(subject, special, pos, endpos) or endpos
      local status, ep = self.attribute_parser:feed(sp, ep2)
      if status == "done" then
        local attribute_start = self.attribute_start
        -- add attribute matches
        self:add_match(attribute_start, attribute_start, "+attributes")
        self:add_match(ep, ep, "-attributes")
        local attr_matches = self.attribute_parser:get_matches()
        for i=1,#attr_matches do
          self:add_match(unpack_match(attr_matches[i]))
        end
        -- restore state to prior to adding attribute parser:
        self.attribute_parser = nil
        self.attribute_start = nil
        self.attribute_slices = nil
        pos = ep + 1
      elseif status == "fail" then
        -- backtrack: re-feed the slices already consumed by the
        -- attribute parser, this time with attributes disabled
        local slices = self.attribute_slices
        self.allow_attributes = false
        self.attribute_parser = nil
        self.attribute_start = nil
        for i=1,#slices do
          self:feed(unpack(slices[i]))
        end
        self.allow_attributes = true
        -- reset the same field the "done" branch resets
        self.attribute_slices = nil
        -- the chunk that failed was never recorded as a slice, so
        -- resume from its start
        pos = sp
      elseif status == "continue" then
        self.attribute_slices[#self.attribute_slices + 1] = {sp,ep}
        pos = ep + 1
      end
    else
      -- find next interesting character:
      local newpos = bounded_find(subject, special, pos, endpos) or endpos + 1
      if newpos > pos then
        self:add_match(pos, newpos - 1, "str")
        pos = newpos
        if pos > endpos then
          break -- otherwise, fall through:
        end
      end
      -- if we get here, then newpos = pos,
      -- i.e. we have something interesting at pos
      local c = byte(subject, pos)

      if c == 13 or c == 10 then -- cr or lf
        -- "\r\n" counts as a single softbreak.  The pattern is a literal
        -- newline: "%n" is not a character class in Lua patterns and
        -- would match the letter "n".
        if c == 13 and bounded_find(subject, "^\n", pos + 1, endpos) then
          self:add_match(pos, pos + 1, "softbreak")
          pos = pos + 2
        else
          self:add_match(pos, pos, "softbreak")
          pos = pos + 1
        end
      elseif self.verbatim > 0 then
        -- inside a verbatim span only a backtick run of the opening
        -- length is significant; everything else is literal
        if c == 96 then
          local _, endchar = bounded_find(subject, "^`+", pos, endpos)
          if endchar and endchar - pos + 1 == self.verbatim then
            -- check for raw attribute
            local sp, ep =
              bounded_find(subject, "^%{%=[^%s{}`]+%}", endchar + 1, endpos)
            if sp and self.verbatim_type == "verbatim" then -- raw
              self:add_match(pos, endchar, "-" .. self.verbatim_type)
              self:add_match(sp, ep, "raw_format")
              pos = ep + 1
            else
              self:add_match(pos, endchar, "-" .. self.verbatim_type)
              pos = endchar + 1
            end
            self.verbatim = 0
            self.verbatim_type = nil
          else
            endchar = endchar or endpos
            self:add_match(pos, endchar, "str")
            pos = endchar + 1
          end
        else
          self:add_match(pos, pos, "str")
          pos = pos + 1
        end
      else
        -- a matcher that returns nothing declines the character
        pos = (matchers[c] and matchers[c](self, pos, endpos))
               or self:single_char(pos)
      end
    end
  end
end
566
-
567
-- Report whether the parser is currently inside an unclosed verbatim
-- span (self.verbatim holds the pending backtick count).
function Parser:in_verbatim()
  local pending_backticks = self.verbatim
  return pending_backticks > 0
end
571
-
572
-- Return parse results and any warnings.
-- Matches are collected in position order from firstpos to lastpos,
-- with adjacent "str" matches merged.  A final softbreak is dropped,
-- trailing spaces are trimmed from a final str, and an unclosed
-- verbatim span is closed with a warning.
-- Returns: array of matches, array of {pos, message} warnings.
function Parser:get_matches()
  local sorted = {}
  local subject = self.subject
  local lastsp, lastep, lastannot
  for i=self.firstpos, self.lastpos do
    local m = self.matches[i]
    if m then
      local sp, ep, annot = unpack_match(m)
      if annot == "str" and lastannot == "str" and lastep + 1 == sp then
        -- consolidate adjacent strs
        sorted[#sorted] = make_match(lastsp, ep, annot)
        lastep = ep
      else
        sorted[#sorted + 1] = m
        lastsp, lastep, lastannot = sp, ep, annot
      end
    end
  end
  if #sorted > 0 then
    local startpos, endpos, annot = unpack_match(sorted[#sorted])
    -- remove final softbreak
    if annot == "softbreak" then
      sorted[#sorted] = nil
      local prev = sorted[#sorted]
      if prev then
        startpos, endpos, annot = unpack_match(prev)
      else
        -- the softbreak was the only match: nothing is left to trim,
        -- and its positions are kept for the verbatim warning below
        annot = nil
      end
    end
    -- remove trailing spaces
    if annot == "str" and byte(subject, endpos) == 32 then
      while endpos > startpos and byte(subject, endpos) == 32 do
        endpos = endpos - 1
      end
      sorted[#sorted] = make_match(startpos, endpos, annot)
    end
    if self.verbatim > 0 then -- unclosed verbatim
      self.warnings[#self.warnings + 1] =
        {startpos, "Unclosed verbatim"}
      sorted[#sorted + 1] = make_match(startpos, endpos,
                                       "-" .. self.verbatim_type)
    end
  end
  return sorted, self.warnings
end
615
-
616
- return { Parser = Parser }
617
-
618
-
619
- --[[
620
- Copyright (C) 2022 John MacFarlane
621
-
622
- Permission is hereby granted, free of charge, to any person obtaining
623
- a copy of this software and associated documentation files (the
624
- "Software"), to deal in the Software without restriction, including
625
- without limitation the rights to use, copy, modify, merge, publish,
626
- distribute, sublicense, and/or sell copies of the Software, and to
627
- permit persons to whom the Software is furnished to do so, subject to
628
- the following conditions:
629
-
630
- The above copyright notice and this permission notice shall be included
631
- in all copies or substantial portions of the Software.
632
-
633
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
634
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
635
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
636
- IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
637
- CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
638
- TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
639
- SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
640
-
641
- ]]