mdlint 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,585 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "state"
4
+
5
module Mdlint
  module Parser
    # Line-oriented Markdown block parser.
    #
    # Walks the source one line at a time through a State object and appends
    # block-level Token objects (headings, fences, lists, paragraphs, ...) to
    # state.tokens. State and Token are defined outside this file; this class
    # relies on the following parts of their interface:
    #
    # * State: eof?, blank_line?, current_line, peek_line, next_line,
    #   skip_blank_lines, line, level / level=, tokens
    # * Token: keyword-argument constructor, level / level=, map / map=
    #
    # Each parse_* method inspects the current line, returns false without
    # consuming anything when the construct does not start there, and otherwise
    # emits its tokens, advances the state and returns true.
    #
    # Simplifications visible in this implementation:
    # * blockquotes only span lines that carry a ">" marker;
    # * list item bodies are flattened into a single paragraph;
    # * every kind of HTML block ends at the first blank line;
    # * bullet list items are matched with any of the "-", "*", "+" markers.
    class BlockParser
      ATX_HEADING_REGEXP = /\A {0,3}(\#{1,6})(?:\s+(.*))?$/
      SETEXT_HEADING_REGEXP = /\A {0,3}(=+|-+)\s*\z/
      FENCE_OPEN_REGEXP = /\A {0,3}(`{3,}|~{3,})([^`]*)\z/
      BLOCKQUOTE_REGEXP = /\A {0,3}> ?/
      HR_REGEXP = /\A {0,3}([-*_])(?:\s*\1){2,}\s*\z/
      BULLET_LIST_REGEXP = /\A( {0,3})([-*+])\s+/
      ORDERED_LIST_REGEXP = /\A( {0,3})(\d{1,9})([.)])\s+/
      CODE_BLOCK_INDENT = /\A {4}/
      HTML_BLOCK_START_1 = /\A {0,3}<(script|pre|style|textarea)[\s>]/i
      HTML_BLOCK_START_2 = /\A {0,3}<!--/
      HTML_BLOCK_START_3 = /\A {0,3}<\?/
      HTML_BLOCK_START_4 = /\A {0,3}<![A-Z]/
      HTML_BLOCK_START_5 = /\A {0,3}<!\[CDATA\[/
      HTML_BLOCK_START_6 = /\A {0,3}<\/?(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|nav|noframes|ol|optgroup|option|p|param|search|section|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(?:\s|\/?>|$)/i
      # Reference definition: [label]: url "title"
      REFERENCE_DEF_REGEXP = /\A {0,3}\[([^\]]+)\]:\s*<?([^\s>]+)>?(?:\s+(?:"([^"]*)"|'([^']*)'|\(([^)]*)\)))?\s*$/

      # All patterns that can open an HTML block, tried in order.
      HTML_BLOCK_STARTS = [
        HTML_BLOCK_START_1,
        HTML_BLOCK_START_2,
        HTML_BLOCK_START_3,
        HTML_BLOCK_START_4,
        HTML_BLOCK_START_5,
        HTML_BLOCK_START_6
      ].freeze

      # Patterns that terminate a paragraph when they match the next line.
      PARAGRAPH_INTERRUPTERS = [
        ATX_HEADING_REGEXP,
        FENCE_OPEN_REGEXP,
        HR_REGEXP,
        BLOCKQUOTE_REGEXP,
        BULLET_LIST_REGEXP,
        ORDERED_LIST_REGEXP,
        REFERENCE_DEF_REGEXP
      ].freeze

      # @param options [Hash] stored and handed to the nested parsers created
      #   for blockquote bodies; not otherwise read in this class
      def initialize(options = {})
        @options = options
      end

      # Parses a whole document.
      #
      # @param src [String] Markdown source
      # @return the token collection accumulated on the State (state.tokens)
      def parse(src)
        state = State.new(src)
        parse_block(state) until state.eof?
        state.tokens
      end

      private

      # Tries each block construct in priority order on the current line.
      # parse_paragraph is the fallback, so a non-EOF call consumes input.
      #
      # @return [nil] at end of input, otherwise the result of the first
      #   parser that accepted the line
      def parse_block(state)
        return if state.eof?

        parse_blank_line(state) ||
          parse_atx_heading(state) ||
          parse_fence(state) ||
          parse_hr(state) ||
          parse_blockquote(state) ||
          parse_bullet_list(state) ||
          parse_ordered_list(state) ||
          parse_html_block(state) ||
          parse_reference_definition(state) ||
          parse_code_block(state) ||
          parse_setext_heading(state) ||
          parse_paragraph(state)
      end

      # Skips a single blank line without emitting a token.
      def parse_blank_line(state)
        return false unless state.blank_line?

        state.next_line
        true
      end

      # Parses an ATX heading ("# Title", "## Title ##", ...).
      def parse_atx_heading(state)
        match = state.current_line.match(ATX_HEADING_REGEXP)
        return false unless match

        level = match[1].length
        # Remove the optional closing run of "#". The run counts as a closing
        # sequence when it follows whitespace or is the entire content
        # ("# #" is an empty heading), but not when glued to text ("# foo#").
        content = (match[2] || "").sub(/(?:\A|\s+)#+\s*\z/, "").strip
        start_line = state.line

        push_heading(state, level: level, markup: "#" * level, content: content,
                            start_line: start_line, end_line: start_line + 1)

        state.next_line
        true
      end

      # Parses a setext heading: a text line followed by a line of "=" (h1)
      # or "-" (h2).
      #
      # The current line is the heading text and the underline is peeked, so
      # the heading may start on the very first line of the input (which is
      # also the first line of every blockquote body parsed by a nested
      # parser).
      def parse_setext_heading(state)
        line = state.current_line
        return false if line.match?(/\A\s*\z/)

        underline = state.peek_line
        return false unless underline

        match = underline.match(SETEXT_HEADING_REGEXP)
        return false unless match

        underline_char = match[1][0]
        level = underline_char == "=" ? 1 : 2
        start_line = state.line

        # The heading spans two source lines; its inline content only the first.
        push_heading(state, level: level, markup: underline_char, content: line.strip,
                            start_line: start_line, end_line: start_line + 2)

        state.next_line
        state.next_line
        true
      end

      # Parses a fenced code block opened by three or more backticks or tildes.
      # An unterminated fence runs to the end of the input.
      def parse_fence(state)
        match = state.current_line.match(FENCE_OPEN_REGEXP)
        return false unless match

        marker = match[1]
        info = match[2].strip
        start_line = state.line
        # A closing fence uses the same character and is at least as long as
        # the opening one. Built once instead of once per body line.
        closing = /\A {0,3}#{Regexp.escape(marker[0])}{#{marker.length},}\s*\z/
        state.next_line

        body = []
        until state.eof?
          current = state.current_line
          state.next_line
          break if current.match?(closing)

          body << current
        end

        state.tokens << Token.new(
          type: :fence,
          tag: "code",
          # Every body line is newline-terminated; an empty body yields "".
          content: body.map { |text| "#{text}\n" }.join,
          markup: marker,
          info: info,
          map: [start_line, state.line]
        )

        true
      end

      # Parses a thematic break ("---", "***", "___", optionally spaced).
      def parse_hr(state)
        line = state.current_line
        return false unless line.match?(HR_REGEXP)

        state.tokens << Token.new(
          type: :hr,
          tag: "hr",
          markup: line.strip[0],
          map: [state.line, state.line + 1]
        )

        state.next_line
        true
      end

      # Parses a run of consecutive ">"-prefixed lines as a blockquote. The
      # stripped lines are parsed by a fresh BlockParser and the resulting
      # tokens are re-based onto this document's nesting level and line
      # numbers.
      def parse_blockquote(state)
        return false unless state.current_line.match?(BLOCKQUOTE_REGEXP)

        start_line = state.line
        quoted = []
        while !state.eof? && state.current_line.match?(BLOCKQUOTE_REGEXP)
          quoted << state.current_line.sub(BLOCKQUOTE_REGEXP, "")
          state.next_line
        end

        state.tokens << Token.new(
          type: :blockquote_open,
          tag: "blockquote",
          nesting: 1,
          level: state.level,
          markup: ">",
          map: [start_line, state.line]
        )

        state.level += 1
        inner_tokens = BlockParser.new(@options).parse(quoted.join("\n"))

        inner_tokens.each do |token|
          # Several token kinds are created without an explicit level; treat a
          # missing level as 0 rather than failing on nil.
          # NOTE(review): Token may already default level to 0 - confirm.
          token.level = (token.level || 0) + state.level
          # One quoted line corresponds to one outer line, so a plain offset
          # converts inner line numbers to outer ones.
          token.map = token.map.map { |l| l + start_line } if token.map
          state.tokens << token
        end

        state.level -= 1

        state.tokens << Token.new(
          type: :blockquote_close,
          tag: "blockquote",
          nesting: -1,
          level: state.level,
          markup: ">"
        )

        true
      end

      # Parses a bullet list starting at the current line.
      def parse_bullet_list(state)
        match = state.current_line.match(BULLET_LIST_REGEXP)
        return false unless match

        marker = match[2]

        state.tokens << Token.new(
          type: :bullet_list_open,
          tag: "ul",
          nesting: 1,
          level: state.level,
          markup: marker,
          map: [state.line, nil]
        )
        list_open = state.tokens.last

        state.level += 1
        parse_list_items(state, BULLET_LIST_REGEXP, marker)
        state.level -= 1

        # The end line is only known once all items have been consumed.
        list_open.map[1] = state.line

        state.tokens << Token.new(
          type: :bullet_list_close,
          tag: "ul",
          nesting: -1,
          level: state.level,
          markup: marker
        )

        true
      end

      # Parses an ordered list starting at the current line. The first item's
      # number is recorded as the :start attribute.
      def parse_ordered_list(state)
        match = state.current_line.match(ORDERED_LIST_REGEXP)
        return false unless match

        start_num = match[2].to_i
        delimiter = match[3]

        state.tokens << Token.new(
          type: :ordered_list_open,
          tag: "ol",
          nesting: 1,
          level: state.level,
          markup: delimiter,
          attrs: { start: start_num },
          map: [state.line, nil]
        )
        list_open = state.tokens.last

        state.level += 1
        parse_ordered_list_items(state, delimiter)
        state.level -= 1

        # The end line is only known once all items have been consumed.
        list_open.map[1] = state.line

        state.tokens << Token.new(
          type: :ordered_list_close,
          tag: "ol",
          nesting: -1,
          level: state.level,
          markup: delimiter
        )

        true
      end

      # Consumes consecutive list items whose first line matches +pattern+.
      #
      # An item is its marker line plus any directly following non-blank lines
      # that start with whitespace and are not themselves item starts. The
      # collected text becomes a single paragraph inside the item. Blank lines
      # after an item are skipped, so blank-separated items stay in one list.
      #
      # @param pattern [Regexp] matches (and is stripped from) an item's first line
      # @param _marker [String] marker of the first item; accepted for
      #   interface compatibility and not used for matching
      def parse_list_items(state, pattern, _marker)
        until state.eof?
          line = state.current_line
          break unless line.match?(pattern)

          item_start = state.line

          state.tokens << Token.new(
            type: :list_item_open,
            tag: "li",
            nesting: 1,
            level: state.level,
            map: [item_start, nil]
          )
          item_open = state.tokens.last

          state.level += 1
          state.next_line

          item_lines = [line.sub(pattern, "")]
          while !state.eof? && !state.blank_line?
            continuation = state.current_line
            # Stop at the next item or at the first unindented line.
            break if continuation.match?(pattern) || !continuation.match?(/\A\s+/)

            item_lines << continuation.sub(/\A\s+/, "")
            state.next_line
          end

          text = item_lines.join("\n").strip
          push_paragraph(state, text, item_start) unless text.empty?

          state.level -= 1
          item_open.map[1] = state.line

          state.tokens << Token.new(
            type: :list_item_close,
            tag: "li",
            nesting: -1,
            level: state.level
          )

          state.skip_blank_lines
        end
      end

      # Consumes ordered list items that use the same delimiter ("." or ")")
      # as the first item; an item with the other delimiter ends the list.
      def parse_ordered_list_items(state, delimiter)
        pattern = /\A( {0,3})(\d{1,9})([#{Regexp.escape(delimiter)}])\s+/
        parse_list_items(state, pattern, delimiter)
      end

      # Parses an HTML block: the opening line plus every following line up to
      # (not including) the first blank line or the end of input.
      def parse_html_block(state)
        line = state.current_line
        return false unless HTML_BLOCK_STARTS.any? { |start| line.match?(start) }

        start_line = state.line
        content_lines = []

        until state.eof?
          content_lines << state.current_line
          state.next_line
          # Check eof? first: blank_line? is never queried past the last line
          # anywhere else in this class.
          break if state.eof? || state.blank_line?
        end

        state.tokens << Token.new(
          type: :html_block,
          content: content_lines.join("\n") + "\n",
          map: [start_line, state.line]
        )

        true
      end

      # Parses an indented code block: consecutive lines starting with four
      # spaces, with that indent removed from the content.
      def parse_code_block(state)
        return false unless state.current_line.match?(CODE_BLOCK_INDENT)

        start_line = state.line
        content_lines = []

        while !state.eof? && state.current_line.match?(CODE_BLOCK_INDENT)
          content_lines << state.current_line.sub(CODE_BLOCK_INDENT, "")
          state.next_line
        end

        state.tokens << Token.new(
          type: :code_block,
          tag: "code",
          content: content_lines.join("\n") + "\n",
          map: [start_line, state.line]
        )

        true
      end

      # Parses a single-line link reference definition. The label is
      # lower-cased; :title is only present in attrs when one was given.
      def parse_reference_definition(state)
        match = state.current_line.match(REFERENCE_DEF_REGEXP)
        return false unless match

        state.tokens << Token.new(
          type: :reference_definition,
          attrs: {
            label: match[1].downcase,
            url: match[2],
            # Title may be written with double quotes, single quotes or parens.
            title: match[3] || match[4] || match[5]
          }.compact,
          map: [state.line, state.line + 1]
        )

        state.next_line
        true
      end

      # Fallback: collects lines into a paragraph until a blank line, the end
      # of input, or a line that starts another block construct.
      def parse_paragraph(state)
        return false if state.blank_line?

        start_line = state.line
        content_lines = []

        while !state.eof? && !state.blank_line?
          line = state.current_line
          break if PARAGRAPH_INTERRUPTERS.any? { |pattern| line.match?(pattern) }

          # A line followed by a setext underline is a heading of its own, so
          # it ends a paragraph that already has content.
          break if state.peek_line&.match?(SETEXT_HEADING_REGEXP) && content_lines.any?

          content_lines << line
          state.next_line
        end

        return false if content_lines.empty?

        push_paragraph(state, content_lines.join("\n"), start_line)
        true
      end

      # Appends the heading_open / inline / heading_close triple.
      #
      # @param end_line [Integer] exclusive end of the heading_open map; the
      #   inline token always maps to the single text line at start_line
      def push_heading(state, level:, markup:, content:, start_line:, end_line:)
        state.tokens << Token.new(
          type: :heading_open,
          tag: "h#{level}",
          nesting: 1,
          level: state.level,
          markup: markup,
          map: [start_line, end_line]
        )

        state.tokens << Token.new(
          type: :inline,
          content: content,
          level: state.level + 1,
          map: [start_line, start_line + 1]
        )

        state.tokens << Token.new(
          type: :heading_close,
          tag: "h#{level}",
          nesting: -1,
          level: state.level,
          # Separate string so the two tokens never share a mutable markup.
          markup: markup.dup
        )
      end

      # Appends the paragraph_open / inline / paragraph_close triple for text
      # spanning start_line up to the state's current line (exclusive).
      def push_paragraph(state, content, start_line)
        state.tokens << Token.new(
          type: :paragraph_open,
          tag: "p",
          nesting: 1,
          level: state.level,
          map: [start_line, state.line]
        )

        state.tokens << Token.new(
          type: :inline,
          content: content,
          level: state.level + 1,
          map: [start_line, state.line]
        )

        state.tokens << Token.new(
          type: :paragraph_close,
          tag: "p",
          nesting: -1,
          level: state.level
        )
      end
    end
  end
end