semantic-md 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
semantic_md/cli.py ADDED
@@ -0,0 +1,81 @@
1
+ from urllib.parse import urlparse
2
+ import os.path
3
+ from json import dump
4
+
5
+ import click
6
+
7
+ from semantic_md import convert
8
+
9
+
10
def httpish(s):
    """Return True if *s* looks like an HTTP(S) URL, otherwise False.

    A value qualifies when its scheme is exactly ``http`` or ``https``
    and it names a host (non-empty netloc).  Strings that ``urlparse``
    rejects with ``ValueError`` are reported as not URL-like.
    """
    try:
        parts = urlparse(s)
    except ValueError:
        return False
    # Compare the scheme exactly (a prefix test would accept "httpx://...")
    # and coerce the netloc so callers always get a real bool.
    return parts.scheme in ('http', 'https') and bool(parts.netloc)
17
+
18
+
19
@click.group()
def cli():
    # Root command group; subcommands attach themselves with @cli.command().
    # Documented with a comment rather than a docstring because click would
    # display a docstring as the group's help text.
    return None
22
+
23
+
24
def _resolve_schema_path(part, source, subdirs, http):
    """Turn the front-matter schema reference *part* into a local file path.

    Raises click.UsageError when the reference needs an option that was
    not given (-x for HTTP, -d for local files) or when it points outside
    the directory holding SOURCE (the cwd when SOURCE is ``-``).
    """
    if httpish(part):
        if not http:
            raise click.UsageError('Use -x to allow HTTP schema downloads')
        # ClickException prints its message; click.Abort would discard it.
        raise click.ClickException('FIXME: not yet implemented')

    if not subdirs:
        raise click.UsageError('Use -d to allow referencing schemas in subdirs')

    if source == '-':
        parent = os.getcwd()
    else:
        parent = os.path.abspath(os.path.split(source)[0])

    partpath, partname = os.path.split(part)
    if not os.path.isabs(partpath):
        partpath = os.path.join(parent, partpath)

    # Compare whole path components: a plain string-prefix test would accept
    # a sibling such as /data/docs-evil for parent /data/docs.
    try:
        common = os.path.commonpath([parent, os.path.abspath(partpath)])
    except ValueError:
        # commonpath raises when the paths cannot be compared
        # (for example different drives on Windows)
        common = None
    if common is None or os.path.normcase(common) != os.path.normcase(parent):
        raise click.UsageError(f'Schema {partpath} is not in a subdir of {parent}')

    return os.path.join(partpath, partname)


@cli.command()
@click.argument('source', default='-')
@click.argument('destination', default='-')
@click.option(
    '-d',
    '--subdirs',
    is_flag=True,
    help='allow accessing semantic-md files in subdirs of SOURCE parent (cwd if SOURCE is -)',
)
@click.option(
    '-s',
    '--schema',
    help='specify semantic-md schema (ignore any given in front matter)',
    metavar='SCHEMA',
)
@click.option(
    '-x',
    '--http',
    is_flag=True,
    help='allow accessing semantic-md schema files over HTTP',
)
def json(source, destination, subdirs, schema, http):
    """Convert SOURCE.md to DESTINATION.json"""
    # NOTE: the docstring above is shown by click as the command help.
    with click.open_file(source, 'r', 'utf-8') as src:
        front, body = convert.md_parse_front_matter(src.read())

    if not schema:
        # The front matter may be empty or not a mapping at all.
        part = front.get('semantic-md') if isinstance(front, dict) else None
        if not part or not isinstance(part, str):
            raise click.UsageError(
                'No semantic-md schema given in front matter; use -s to specify one'
            )
        schema = _resolve_schema_path(part, source, subdirs, http)

    with click.open_file(schema, 'r', 'utf-8') as schema_file:
        s = convert.Schema.read(schema_file.read())

    parsed = convert.md_parse_body(body, s)
    out = convert.to_json(parsed, s)

    # Open the destination only once there is something to write, so a
    # failed conversion does not leave an existing file truncated.
    with click.open_file(destination, 'w', 'utf-8') as dst:
        dump(out, dst, indent=2)
semantic_md/convert.py ADDED
@@ -0,0 +1,102 @@
1
+ import re
2
+ from yaml import safe_load
3
+
4
+ from mistletoe import Document
5
+ from mistletoe.block_token import Heading
6
+ from mistletoe.markdown_renderer import MarkdownRenderer, BlankLine
7
+
8
+ from .match import HeadingMatch, apply_path_vars, parse_matches
9
+ from .text import NoMatch
10
+
11
+
12
class Schema:
    """Top-level semantic-md schema: one-shot ``children`` plus repeating ``sections``."""

    @classmethod
    def read(cls, yaml_str):
        """Build a Schema from the YAML text *yaml_str*.

        The document must be a mapping containing ``semantic-md-version``;
        ``children`` and ``sections`` are optional lists of match
        definitions.  Raises InputError for any other shape.
        """
        y = safe_load(yaml_str)

        # Explicit checks instead of `assert`, which is stripped under -O.
        if not isinstance(y, dict):
            raise InputError('schema must be a YAML mapping')
        if 'semantic-md-version' not in y:
            raise InputError('schema is missing semantic-md-version')
        unknown = set(y) - {'children', 'sections', 'semantic-md-version'}
        if unknown:
            names = ', '.join(sorted(map(str, unknown)))
            raise InputError(f'unknown schema keys: {names}')

        schema = cls()
        # `or []` also covers keys that are present but empty (YAML null)
        schema.children = parse_matches(y.get('children') or [])
        schema.sections = parse_matches(y.get('sections') or [])

        return schema
23
+
24
+
25
class MatchFrame:
    """Iteration state for one schema level while walking a document.

    Iterating a frame yields the level's remaining ``children`` followed
    by all of its ``sections``.  The children iterator is shared between
    iterations, so a child that has been yielded is never offered again.
    """

    def __init__(self, schema: 'Schema | HeadingMatch'):
        self.schema = schema
        # sections repeat, children only match once
        # `children` may be None (HeadingMatch default); treat as empty.
        self.children = iter(schema.children or ())

    def __iter__(self):
        yield from self.children
        yield from self.schema.sections or ()
35
+
36
+
37
def to_json(doc: Document, schema: Schema):
    """Walk the tokens of *doc*, matching them against *schema*.

    Each matching schema entry patches the JSON document being built; a
    heading match additionally opens a nested schema level and extends
    the path prefix used by the patches below it.

    Returns the resulting JSON-compatible dict.  Raises NoMatch when a
    token matches nothing at the current level, and InputError when a
    matched schema heading is not a heading exactly one level deeper
    than the level it is nested in.
    """
    doc_pos = 0
    heading_level = 0
    schema_stack = [MatchFrame(schema)]
    prefix_stack = ['/']
    json_doc = {}

    while doc_pos < len(doc.children):
        tok = doc.children[doc_pos]
        if isinstance(tok, BlankLine):
            doc_pos += 1
            continue

        if isinstance(tok, Heading):
            # a heading at the same or a shallower level closes open levels
            while tok.level <= heading_level:
                schema_stack.pop()
                prefix_stack.pop()
                heading_level -= 1

        for match in schema_stack[-1]:
            result = match.match_md(doc.children, doc_pos)
            # A result that consumes no tokens cannot advance the walk;
            # treat it as a non-match so the loop always makes progress.
            if not result or not result.tokens:
                continue

            doc_pos += result.tokens
            json_doc = match.patch(result, json_doc, ''.join(prefix_stack))

            if isinstance(match, HeadingMatch):
                heading = match.doc[0]
                # Raised explicitly: asserts are stripped under -O.
                if not isinstance(heading, Heading):
                    raise InputError('schema heading_match does not start with a heading')
                if heading_level + 1 != heading.level:
                    raise InputError(
                        f'schema heading level {heading.level} found where '
                        f'level {heading_level + 1} was expected'
                    )
                heading_level = heading.level
                schema_stack.append(MatchFrame(match))
                prefix_stack.append(
                    apply_path_vars(match.patch_path, result.vars_map) + '/'
                    if match.patch_path
                    else ''
                )
            break
        else:
            with MarkdownRenderer() as renderer:
                raise NoMatch(
                    f'line {doc.children[doc_pos].line_number}\n'
                    + renderer.render(doc.children[doc_pos])
                )

    return json_doc
82
+
83
+
84
class InputError(Exception):
    """Raised when the supplied input does not have the expected structure."""
86
+
87
+
88
def md_parse_front_matter(s):
    """Split markdown text *s* into its YAML front matter and body.

    The text must open with a ``---`` line, followed by the YAML and a
    closing ``---`` line.  Returns ``(front, body)`` where *front* is the
    parsed YAML and *body* is everything after the closing line.  Raises
    InputError when that framing is not found.
    """
    # maxsplit by keyword: passing it positionally is deprecated (3.13+)
    parts = re.split(r'^---\s*$', s, maxsplit=2, flags=re.MULTILINE)
    # Need both delimiters, nothing but whitespace before the first one,
    # and at least one line break inside the front matter.
    if len(parts) != 3 or parts[0].strip() or '\n' not in parts[1]:
        raise InputError('expected yaml front matter not found')

    front = safe_load(parts[1])
    body = parts[2]

    return front, body
97
+
98
+
99
def md_parse_body(body, schema):
    """Parse the markdown text *body* into a mistletoe Document."""
    # `schema` is accepted but not consulted by this function.
    # context manager required for creation of BlankLine, etc.
    with MarkdownRenderer():
        document = Document(body)
    return document
semantic_md/match.py ADDED
@@ -0,0 +1,366 @@
1
+ import re
2
+ import json
3
+ from dataclasses import dataclass
4
+ from typing import Any
5
+
6
+ from jsonpatch import JsonPatch
7
+ from jsonpointer import JsonPointer
8
+ from mistletoe import Document
9
+ from mistletoe.block_token import (
10
+ BlockToken,
11
+ Table,
12
+ TableCell,
13
+ Paragraph,
14
+ Quote,
15
+ BlockCode,
16
+ CodeFence,
17
+ List,
18
+ ListItem,
19
+ HtmlBlock,
20
+ )
21
+ from mistletoe.markdown_renderer import MarkdownRenderer, BlankLine
22
+ from mistletoe.span_token import RawText
23
+ from mistletoe.token import Token
24
+
25
+ from .text import match_content, NoMatch
26
+
27
+ MISTUNE_PLUGINS = ['table', 'def_list']
28
+
29
+ MD_FILTER_VAR = re.compile(r'^\s*{\s*(\w+[\d\w]*)\s*\|\s*md\s*}\s*$')
30
+ MD_FILTER_TYPES = (
31
+ Paragraph,
32
+ Quote,
33
+ BlockCode,
34
+ CodeFence,
35
+ List,
36
+ HtmlBlock,
37
+ BlankLine,
38
+ )
39
+
40
+ LIST_FILTER_VAR = re.compile(r'^\s*{\s*(\w+[\d\w]*)\s*\|\s*list\s*}\s*$')
41
+
42
+
43
@dataclass
class MatchResult:
    """Outcome of a successful match against document tokens."""

    # how many document tokens the match consumed
    tokens: int
    # values collected for the match's variables, keyed by variable name
    vars_map: dict[str, str]
47
+
48
+
49
@dataclass
class TableMatchResult(MatchResult):
    """MatchResult for a table, carrying the table's row tokens."""

    # the matched table's children (its rows), consumed when patching
    table_data: list[Token]
52
+
53
+
54
def is_md_filter_var(match_token):
    """{var|md}

    Return the variable name when *match_token* is a paragraph whose only
    child holds exactly a ``{var|md}`` placeholder, otherwise None.
    """
    if not isinstance(match_token, Paragraph) or len(match_token.children) != 1:
        return None
    # The single child may be a token without string content (for example
    # nested inline markup); that is simply not a placeholder.
    content = getattr(match_token.children[0], 'content', None)
    if not isinstance(content, str):
        return None
    md_var = MD_FILTER_VAR.match(content)
    return md_var.group(1) if md_var else None
62
+
63
+
64
def is_list_filter_var(match_token):
    """- {var|list}

    Return the variable name when *match_token* is a list with a single
    item whose single paragraph holds exactly a ``{var|list}``
    placeholder, otherwise None.
    """
    if not isinstance(match_token, List) or len(match_token.children) != 1:
        return None
    li = match_token.children[0]
    if not isinstance(li, ListItem) or len(li.children) != 1:
        return None
    p = li.children[0]
    if not isinstance(p, Paragraph) or len(p.children) != 1:
        return None
    # The paragraph's child may be a token without string content (for
    # example nested inline markup); that is simply not a placeholder.
    content = getattr(p.children[0], 'content', None)
    if not isinstance(content, str):
        return None
    list_var = LIST_FILTER_VAR.match(content)
    return list_var.group(1) if list_var else None
76
+
77
+
78
def match_content_tree(
    vars_map: dict[str, str | None],
    match_token: Token,
    tokens: list[Token],
    token_pos: int,
) -> int:
    """
    returns number of tokens matched by match_token starting
    from tokens[token_pos]. On match updates vars_map in-place.

    A return value of 0 means "no match".  Three kinds of match_token
    are handled: a ``{var|md}`` paragraph (captures a run of block
    tokens as markdown text), a ``- {var|list}`` list (captures the item
    texts of one list), and any other token (matched structurally, with
    RawText compared through match_content).
    """
    if md_var := is_md_filter_var(match_token):
        # Consume the run of capturable tokens, remembering the last one
        # that is not a blank line so trailing blanks are not rendered.
        end = token_pos
        nonblank = token_pos
        while end < len(tokens) and isinstance(tokens[end], MD_FILTER_TYPES):
            if not isinstance(tokens[end], BlankLine):
                nonblank = end
            end += 1
        if end == token_pos:
            # nothing capturable here; leave vars_map untouched
            return 0
        with MarkdownRenderer() as renderer:
            md = ''.join(
                renderer.render(tokens[j]) for j in range(token_pos, nonblank + 1)
            )
        if vars_map[md_var] is None:
            vars_map[md_var] = md
        elif vars_map[md_var] != md:
            return 0
        # `end` is one past the last consumed token, including when the
        # run reaches the end of `tokens`.
        return end - token_pos

    if list_var := is_list_filter_var(match_token):
        if token_pos >= len(tokens) or not isinstance(tokens[token_pos], List):
            return 0
        # FIXME: handle non-single-paragraph lists
        list_vals = [
            c.children[0].children[0].content for c in tokens[token_pos].children
        ]
        if vars_map[list_var] is None:
            vars_map[list_var] = list_vals
        elif vars_map[list_var] != list_vals:
            return 0
        return 1

    if token_pos >= len(tokens) or not isinstance(
        tokens[token_pos], type(match_token)
    ):
        return 0

    if mtoks := match_token.children:
        toks = tokens[token_pos].children or ()
        pos = 0
        for mt in mtoks:
            # `>=`: position len(toks) is already past the last child
            if pos >= len(toks):
                return 0
            if not (matched := match_content_tree(vars_map, mt, toks, pos)):
                return 0
            pos += matched
        if pos < len(toks):
            # the document token has children the pattern does not cover
            return 0

    if isinstance(match_token, RawText) and hasattr(match_token, 'content'):
        try:
            new_vars = match_content(
                vars_map, match_token.content, tokens[token_pos].content
            )
        except NoMatch:
            return 0
        vars_map.update(new_vars)
    return 1
140
+
141
+
142
def match_block_tokens(
    match_tokens: list[BlockToken],
    tokens: list[BlockToken],
    token_pos: int,
    vars_: list[str],
) -> MatchResult | None:
    """Match the pattern tokens in order against tokens[token_pos:].

    Blank lines following each matched pattern token are consumed too.
    Returns a MatchResult with the number of tokens consumed and the
    collected variables, or None as soon as one pattern token fails.
    """
    # every declared variable starts out unset
    vars_map = dict.fromkeys(vars_ or [])
    consumed = 0
    for match_token in match_tokens:
        cursor = token_pos + consumed
        if cursor >= len(tokens):
            return None

        matched = match_content_tree(vars_map, match_token, tokens, cursor)
        if not matched:
            return None
        consumed += matched

        # swallow blank lines before trying the next pattern token
        while token_pos + consumed < len(tokens) and isinstance(
            tokens[token_pos + consumed], BlankLine
        ):
            consumed += 1

    return MatchResult(consumed, vars_map)
167
+
168
+
169
def match_table_columns(
    cols: list[str],
    row_submatch: dict[str, Any] | None,
    tokens: list[BlockToken],
    pos: int,
) -> MatchResult | None:
    """Match tokens[pos] as a table whose header cells match *cols*.

    Returns a TableMatchResult consuming one token and carrying the
    table's rows, or None when the token is not a table, the column
    count differs, or a header cell does not match its pattern.
    """
    vars_map = {}

    table = tokens[pos]
    if not isinstance(table, Table):
        return None

    if len(table.header.children) != len(cols):
        return None

    try:
        for i, txt in enumerate(cols):
            # match_content_tree reports a mismatch by returning 0, so the
            # result has to be checked; it does not raise for mismatches.
            if not match_content_tree(
                vars_map, TableCell(content=txt), table.header.children, i
            ):
                return None
    except NoMatch:
        return None

    if row_submatch:
        # FIXME: check that submatch matches
        pass

    return TableMatchResult(1, vars_map, table.children)
195
+
196
+
197
def apply_path_vars(path: str, vars_map: dict[str, str]):
    """Replace every ``$name`` in *path* with ``vars_map[name]``.

    A reference only counts when the name ends at a word boundary, so
    ``$1`` does not match inside ``$10``.  All references are replaced in
    a single pass: text that arrives through a value is never expanded
    again, even if it happens to look like ``$other``.
    """
    if not vars_map:
        return path
    # FIXME: jsonpath escaping for val?
    names = '|'.join(re.escape(var) for var in vars_map)
    # a function replacement keeps backslashes in values literal
    return re.sub(
        r'\$(' + names + r')\b',
        lambda m: vars_map[m.group(1)],
        path,
    )
202
+
203
+
204
def apply_json_patch(
    patch_add: dict[str, Any],
    vars_map: dict[str, str],
    json_doc: dict[str, Any],
    prefix: str,
) -> dict[str, Any]:
    """Apply the ``add`` operations described by *patch_add* to *json_doc*.

    Keys of *patch_add* are paths relative to *prefix*; ``$name``
    references in them are filled from *vars_map*.  In the values, any
    JSON string that is exactly ``"$name"`` is replaced by the variable's
    value.  Missing intermediate containers are created in *json_doc*
    before the patch runs.  Returns the result of applying the JsonPatch.
    """
    # One pattern for all variables, applied in a single pass, so text
    # that arrives through a value is never treated as a placeholder.
    placeholder = None
    if vars_map:
        names = '|'.join(re.escape(var) for var in vars_map)
        placeholder = re.compile(r'(?<!\\)"\$(' + names + r')"')

    filled_patch = {}
    for path, json_value in patch_add.items():
        jv = json.dumps(json_value)
        if placeholder is not None:
            jv = placeholder.sub(lambda m: json.dumps(vars_map[m.group(1)]), jv)
        filled_patch[apply_path_vars(path, vars_map)] = json.loads(jv)

    # create missing objects in paths
    prefix_pos = json_doc
    for step in JsonPointer(prefix).get_parts()[:-1]:
        prefix_pos = prefix_pos.setdefault(step, {})
    for path, json_value in filled_patch.items():
        if path:
            path_pos = prefix_pos
            steps = JsonPointer('/' + path).get_parts()
            for i, step in enumerate(steps):
                # a path ending in "-" targets a list: create that list
                # at the second-to-last step and stop there
                if steps[-1:] == ['-'] and i == len(steps) - 2:
                    path_pos = path_pos.setdefault(step, [])
                    break
                path_pos = path_pos.setdefault(step, {})

    operations = [
        {'op': 'add', 'path': prefix + path, 'value': json_value}
        for path, json_value in filled_patch.items()
    ]
    return JsonPatch(operations).apply(json_doc)
237
+
238
+
239
def apply_table_json_patch(
    row_patch_add: dict[str, Any],
    row_submatch: dict[str, Any] | None,
    result: TableMatchResult,
    json_doc: dict[str, Any],
    prefix: str,
) -> dict[str, Any]:
    """Patch *json_doc* once per table row in *result*.

    For each row the cell texts become the variables ``1``, ``2``, ...
    and *row_patch_add* is applied.  Each *row_submatch* entry then picks
    a source string (its key, with variables filled in) and runs its
    rules against it: ``filter_match`` rules may narrow the source for
    the rules after them, ``match`` rules collect extra variables.
    Rules that do not match are skipped.  Returns the patched document.
    """
    # FIXME: assuming cells are always a single RawText
    for row in result.table_data:
        # an empty cell has no children; treat its text as ''
        vars_map = {
            f'{i + 1}': cell.children[0].content if cell.children else ''
            for i, cell in enumerate(row.children)
        }
        json_doc = apply_json_patch(row_patch_add, vars_map, json_doc, prefix)

        for rm, rules in (row_submatch or {}).items():
            src = apply_path_vars(rm, vars_map)
            for rule in rules:
                if flt := rule.get('filter_match'):
                    try:
                        content_vars = match_content({'content': None}, flt, src)
                    except NoMatch:
                        continue
                    # `content` is absent when the filter has no {content}
                    if cmatch := content_vars.get('content'):
                        src = cmatch
                    if patch_add := rule.get('patch_add'):
                        json_doc = apply_json_patch(
                            patch_add, vars_map, json_doc, prefix
                        )

                elif mat := rule.get('match'):
                    mvars = {var: None for var in rule.get('vars') or []}
                    try:
                        mvars = match_content(mvars, mat, src)
                    except NoMatch:
                        continue
                    if patch_add := rule.get('patch_add'):
                        json_doc = apply_json_patch(
                            patch_add, {**vars_map, **mvars}, json_doc, prefix
                        )
    return json_doc
277
+
278
+
279
class MatchBase:
    """Shared matching and patching behaviour for schema match entries.

    Subclasses supply ``doc``, ``vars_`` and ``patch_add``.
    """

    def match_md(
        self,
        tokens: list[BlockToken],
        pos: int,
    ) -> MatchResult | None:
        """Match this entry's pattern tokens against tokens[pos:]."""
        return match_block_tokens(self.doc, tokens, pos, self.vars_)

    def patch(
        self, result: MatchResult, json_doc: dict[str, Any], prefix: str
    ) -> dict[str, Any]:
        """Apply ``patch_add`` (if any) using the variables from *result*."""
        if not self.patch_add:
            return json_doc
        return apply_json_patch(self.patch_add, result.vars_map, json_doc, prefix)
293
+
294
+
295
@dataclass
class Match(MatchBase):
    """Schema entry matching a sequence of block tokens."""

    # pattern tokens parsed from the entry's markdown
    doc: list[BlockToken]
    # patch applied on a match; None when the entry defines none
    patch_add: dict[str, Any] | None
    # names of the variables the pattern may capture; None when not given
    vars_: list[str] | None
300
+
301
+
302
+ @dataclass
303
+ class TableMatch(MatchBase):
304
+ cols: list[str]
305
+ row_patch_add: dict[str:Any]
306
+ row_submatch: dict[str:Any] | None = None
307
+
308
+ @property
309
+ def vars_(self):
310
+ return [f'${n + 1}' for n in range(len(self.cols))]
311
+
312
+ def match_md(
313
+ self,
314
+ tokens: list[BlockToken],
315
+ pos: int,
316
+ ) -> TableMatchResult | None:
317
+ return match_table_columns(self.cols, self.row_submatch, tokens, pos)
318
+
319
+ def patch(
320
+ self, result: TableMatchResult, json_doc: dict[str, Any], prefix: str
321
+ ) -> dict[str, Any]:
322
+ return apply_table_json_patch(
323
+ self.row_patch_add,
324
+ self.row_submatch,
325
+ result,
326
+ json_doc,
327
+ prefix,
328
+ )
329
+
330
+
331
@dataclass
class HeadingMatch(MatchBase):
    """Schema entry matching a heading and opening a nested schema level."""

    # pattern tokens parsed from the entry's markdown
    doc: list[BlockToken]
    # path segment added to the patch prefix for everything under the heading
    patch_path: str | None
    # patch applied when the heading itself matches
    patch_add: dict[str, Any] | None
    # names of the variables the pattern may capture
    vars_: list[str] | None
    # entries tried once each under this heading; None when not given
    children: list[MatchBase] | None = None
    # entries that may repeat under this heading; None when not given
    sections: list[MatchBase] | None = None
339
+
340
+
341
class UnknownMatch(Exception):
    """Raised for a schema entry that is not a recognised kind of match."""
343
+
344
+
345
def parse_match(m):
    """Build the match object described by the schema mapping *m*.

    The kind is chosen by the first of ``heading_match``, ``match`` or
    ``table_match`` that holds a truthy value; UnknownMatch is raised
    when none does.
    """
    heading_md = m.get('heading_match')
    if heading_md:
        heading = HeadingMatch(
            doc=Document(heading_md).children,
            patch_path=m.get('patch_path'),
            patch_add=m.get('patch_add'),
            vars_=m.get('vars'),
        )
        # nested entries are only attached when the schema provides some
        nested_children = m.get('children')
        if nested_children:
            heading.children = parse_matches(nested_children)
        nested_sections = m.get('sections')
        if nested_sections:
            heading.sections = parse_matches(nested_sections)
        return heading

    block_md = m.get('match')
    if block_md:
        return Match(
            doc=Document(block_md).children,
            patch_add=m.get('patch_add'),
            vars_=m.get('vars'),
        )

    cols = m.get('table_match')
    if cols:
        return TableMatch(
            cols=cols,
            row_patch_add=m['row_patch_add'],
            row_submatch=m.get('row_submatch'),
        )

    raise UnknownMatch(m)
363
+
364
+
365
def parse_matches(matches):
    """Return a list with the parsed form of every entry in *matches*."""
    parsed = []
    for entry in matches:
        parsed.append(parse_match(entry))
    return parsed
semantic_md/text.py ADDED
@@ -0,0 +1,38 @@
1
+ import re
2
+
3
+
4
+ class NoMatch(Exception):
5
+ pass
6
+
7
+
8
+ def match_content(
9
+ vars_map: dict[str, str | None],
10
+ pattern: str,
11
+ content: str,
12
+ ) -> dict[str, str]:
13
+ """
14
+ Collect/apply vars_map to {key} values in pattern against
15
+ content.
16
+
17
+ Return newly collected values when pattern matches content
18
+ (empty dict is success with no values) otherwise raise NoMatch.
19
+ """
20
+ ep = r'^\s*' + re.escape(pattern) + r'\s*$'
21
+
22
+ # var names are restricted to [a-z][0-9]_
23
+ for var, val in vars_map.items():
24
+ if val is not None:
25
+ ep = ep.replace(r'\{' + var + r'\}', re.escape(val))
26
+ continue
27
+ ep = ep.replace(r'\{' + var + r'\}', '(?P<g' + var + '>.*)', 1)
28
+ ep = ep.replace(r'\{' + var + r'\}', '(?P=g' + var + ')')
29
+
30
+ m = re.match(ep, content)
31
+ if not m:
32
+ raise NoMatch()
33
+
34
+ return {
35
+ var: m.group('g' + var)
36
+ for var, val in vars_map.items()
37
+ if val is None and 'g' + var in m.groupdict()
38
+ }
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.4
2
+ Name: semantic-md
3
+ Version: 0.0.1
4
+ Summary: Semantic Markdown tools
5
+ Project-URL: Homepage, https://github.com/semantic-md/semantic-md
6
+ Project-URL: Issues, https://github.com/semantic-md/semantic-md/issues
7
+ Author-email: Ian Ward <ian@excess.org>
8
+ License-Expression: MIT
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Programming Language :: Python :: 3
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+
14
+ FIXME: write readme
@@ -0,0 +1,9 @@
1
+ semantic_md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ semantic_md/cli.py,sha256=f3_QB20clM2AUpzveweigQHFpLwAR35EqZHvAJwZ4Ts,2175
3
+ semantic_md/convert.py,sha256=LrmIHNWZSkXo_LrOSS2Dn4xeFSoj9K24RDenhUe3mdM,3044
4
+ semantic_md/match.py,sha256=pXNYqN8I8RYVTQDITzShC9eVzcWhjQbPgJG0Edtvp3w,10343
5
+ semantic_md/text.py,sha256=2YYgKwh80IXCXl_0BN6FUxybXjiTpbsAS5kU3o12Fao,986
6
+ semantic_md-0.0.1.dist-info/METADATA,sha256=Bc2Zb1XtfV7UWkIOIaHBdGdZOBSj5AScXNFbt5VjMRI,469
7
+ semantic_md-0.0.1.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
8
+ semantic_md-0.0.1.dist-info/entry_points.txt,sha256=rugB6lQGPWrx7qfLbGSzymrKpNRnOGX7xeBXpqS4S_Y,44
9
+ semantic_md-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ smd = semantic_md.cli:cli