semantic-md 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ __pycache__
2
+ .*.swp
3
+ dist/
@@ -0,0 +1,13 @@
1
+ .PHONY: test-interactive
2
+ test-interactive:
3
+ export PYTHONBREAKPOINT=ipdb.set_trace
4
+ pytest tests/ -s --pdb -vv
5
+
6
+ check:
7
+ ruff check
8
+
9
+ check-fix:
10
+ ruff check --fix
11
+
12
+ format:
13
+ ruff format
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.4
2
+ Name: semantic-md
3
+ Version: 0.0.1
4
+ Summary: Semantic Markdown tools
5
+ Project-URL: Homepage, https://github.com/semantic-md/semantic-md
6
+ Project-URL: Issues, https://github.com/semantic-md/semantic-md/issues
7
+ Author-email: Ian Ward <ian@excess.org>
8
+ License-Expression: MIT
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Programming Language :: Python :: 3
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+
14
+ FIXME: write readme
@@ -0,0 +1,15 @@
1
+ [[source]]
2
+ url = "https://pypi.org/simple"
3
+ verify_ssl = true
4
+ name = "pypi"
5
+
6
+ [packages]
7
+ pyyaml = "*"
8
+ mistletoe = "*"
9
+ jsonpatch = "*"
10
+ click = "*"
11
+
12
+ [dev-packages]
13
+
14
+ [requires]
15
+ python_version = "3.12"
@@ -0,0 +1,136 @@
1
+ {
2
+ "_meta": {
3
+ "hash": {
4
+ "sha256": "23b44a21a83c1a118df662508c539475644363d276e6c87a0e06517c1134a76b"
5
+ },
6
+ "pipfile-spec": 6,
7
+ "requires": {
8
+ "python_version": "3.12"
9
+ },
10
+ "sources": [
11
+ {
12
+ "name": "pypi",
13
+ "url": "https://pypi.org/simple",
14
+ "verify_ssl": true
15
+ }
16
+ ]
17
+ },
18
+ "default": {
19
+ "click": {
20
+ "hashes": [
21
+ "sha256:40c50b7c6c6adac2823d411041ec84f3f103f1b280d5e9ce0d7f998995832f81",
22
+ "sha256:638f1338fe1235c8f4e008e4a8a254fb5c5fbdcbb40ece3c9142ebb78e792973"
23
+ ],
24
+ "index": "pypi",
25
+ "markers": "python_version >= '3.10'",
26
+ "version": "==8.4.0"
27
+ },
28
+ "jsonpatch": {
29
+ "hashes": [
30
+ "sha256:0ae28c0cd062bbd8b8ecc26d7d164fbbea9652a1a3693f3b956c1eae5145dade",
31
+ "sha256:9fcd4009c41e6d12348b4a0ff2563ba56a2923a7dfee731d004e212e1ee5030c"
32
+ ],
33
+ "index": "pypi",
34
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'",
35
+ "version": "==1.33"
36
+ },
37
+ "jsonpointer": {
38
+ "hashes": [
39
+ "sha256:0b801c7db33a904024f6004d526dcc53bbb8a4a0f4e32bfd10beadf60adf1900",
40
+ "sha256:8ff8b95779d071ba472cf5bc913028df06031797532f08a7d5b602d8b2a488ca"
41
+ ],
42
+ "markers": "python_version >= '3.10'",
43
+ "version": "==3.1.1"
44
+ },
45
+ "mistletoe": {
46
+ "hashes": [
47
+ "sha256:c5571ce6ca9cfdc7ce9151c3ae79acb418e067812000907616427197648030a3",
48
+ "sha256:d3e97664798261503f685f6a6281b092628367cf3128fc68a015a993b0c4feb3"
49
+ ],
50
+ "index": "pypi",
51
+ "markers": "python_version ~= '3.5'",
52
+ "version": "==1.5.1"
53
+ },
54
+ "pyyaml": {
55
+ "hashes": [
56
+ "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c",
57
+ "sha256:0150219816b6a1fa26fb4699fb7daa9caf09eb1999f3b70fb6e786805e80375a",
58
+ "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3",
59
+ "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956",
60
+ "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6",
61
+ "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c",
62
+ "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65",
63
+ "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a",
64
+ "sha256:1ebe39cb5fc479422b83de611d14e2c0d3bb2a18bbcb01f229ab3cfbd8fee7a0",
65
+ "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b",
66
+ "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1",
67
+ "sha256:22ba7cfcad58ef3ecddc7ed1db3409af68d023b7f940da23c6c2a1890976eda6",
68
+ "sha256:27c0abcb4a5dac13684a37f76e701e054692a9b2d3064b70f5e4eb54810553d7",
69
+ "sha256:28c8d926f98f432f88adc23edf2e6d4921ac26fb084b028c733d01868d19007e",
70
+ "sha256:2e71d11abed7344e42a8849600193d15b6def118602c4c176f748e4583246007",
71
+ "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310",
72
+ "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4",
73
+ "sha256:3c5677e12444c15717b902a5798264fa7909e41153cdf9ef7ad571b704a63dd9",
74
+ "sha256:3ff07ec89bae51176c0549bc4c63aa6202991da2d9a6129d7aef7f1407d3f295",
75
+ "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea",
76
+ "sha256:418cf3f2111bc80e0933b2cd8cd04f286338bb88bdc7bc8e6dd775ebde60b5e0",
77
+ "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e",
78
+ "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac",
79
+ "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9",
80
+ "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7",
81
+ "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35",
82
+ "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb",
83
+ "sha256:5cf4e27da7e3fbed4d6c3d8e797387aaad68102272f8f9752883bc32d61cb87b",
84
+ "sha256:5e0b74767e5f8c593e8c9b5912019159ed0533c70051e9cce3e8b6aa699fcd69",
85
+ "sha256:5ed875a24292240029e4483f9d4a4b8a1ae08843b9c54f43fcc11e404532a8a5",
86
+ "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b",
87
+ "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c",
88
+ "sha256:6344df0d5755a2c9a276d4473ae6b90647e216ab4757f8426893b5dd2ac3f369",
89
+ "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd",
90
+ "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824",
91
+ "sha256:66291b10affd76d76f54fad28e22e51719ef9ba22b29e1d7d03d6777a9174198",
92
+ "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065",
93
+ "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c",
94
+ "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c",
95
+ "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764",
96
+ "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196",
97
+ "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b",
98
+ "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00",
99
+ "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac",
100
+ "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8",
101
+ "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e",
102
+ "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28",
103
+ "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3",
104
+ "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5",
105
+ "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4",
106
+ "sha256:9c7708761fccb9397fe64bbc0395abcae8c4bf7b0eac081e12b809bf47700d0b",
107
+ "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf",
108
+ "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5",
109
+ "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702",
110
+ "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8",
111
+ "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788",
112
+ "sha256:b865addae83924361678b652338317d1bd7e79b1f4596f96b96c77a5a34b34da",
113
+ "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d",
114
+ "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc",
115
+ "sha256:bdb2c67c6c1390b63c6ff89f210c8fd09d9a1217a465701eac7316313c915e4c",
116
+ "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba",
117
+ "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f",
118
+ "sha256:c3355370a2c156cffb25e876646f149d5d68f5e0a3ce86a5084dd0b64a994917",
119
+ "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5",
120
+ "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26",
121
+ "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f",
122
+ "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b",
123
+ "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be",
124
+ "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c",
125
+ "sha256:efd7b85f94a6f21e4932043973a7ba2613b059c4a000551892ac9f1d11f5baf3",
126
+ "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6",
127
+ "sha256:fa160448684b4e94d80416c0fa4aac48967a969efe22931448d853ada8baf926",
128
+ "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0"
129
+ ],
130
+ "index": "pypi",
131
+ "markers": "python_version >= '3.8'",
132
+ "version": "==6.0.3"
133
+ }
134
+ },
135
+ "develop": {}
136
+ }
@@ -0,0 +1 @@
1
+ FIXME: write readme
@@ -0,0 +1,34 @@
1
+ [build-system]
2
+ requires = ["hatchling >= 1.26"]
3
+ build-backend = "hatchling.build"
4
+
5
[project]
name = "semantic-md"
version = "0.0.1"
authors = [
  { name="Ian Ward", email="ian@excess.org" },
]
description = "Semantic Markdown tools"
readme = "README.md"
requires-python = ">=3.10"
# Runtime imports of the semantic_md package; without these an installed
# `smd` entry point fails with ImportError.
dependencies = [
  "click",
  "jsonpatch",
  "jsonpointer",
  "mistletoe",
  "pyyaml",
]
classifiers = [
    "Programming Language :: Python :: 3",
    "Operating System :: OS Independent",
]
license = "MIT"
license-files = ["LICEN[CS]E*"]
20
+
21
+ [project.scripts]
22
+ smd = "semantic_md.cli:cli"
23
+
24
+ [project.urls]
25
+ Homepage = "https://github.com/semantic-md/semantic-md"
26
+ Issues = "https://github.com/semantic-md/semantic-md/issues"
27
+
28
+ [tool.ruff]
29
+ line-length = 90
30
+
31
+ [tool.ruff.format]
32
+ quote-style = "single"
33
+ docstring-code-format = true
34
+ line-ending = "lf"
File without changes
@@ -0,0 +1,81 @@
1
+ from urllib.parse import urlparse
2
+ import os.path
3
+ from json import dump
4
+
5
+ import click
6
+
7
+ from semantic_md import convert
8
+
9
+
10
def httpish(s):
    """
    Return True if s looks like an HTTP(S) url, otherwise False.

    Only the exact schemes ``http`` and ``https`` (urlparse lower-cases the
    scheme) with a non-empty network location qualify. Strings that urlparse
    rejects with ValueError are reported as not HTTP-ish.
    """
    try:
        parsed = urlparse(s)
    except ValueError:
        return False
    # bool() so callers always receive a real boolean, never the netloc string
    return parsed.scheme in ('http', 'https') and bool(parsed.netloc)
17
+
18
+
19
# Root click command group; subcommands attach themselves with @cli.command().
# Deliberately has no docstring: click would show it as the group's help text.
@click.group()
def cli():
    pass
22
+
23
+
24
@cli.command()
@click.argument('source', default='-')
@click.argument('destination', default='-')
@click.option(
    '-d',
    '--subdirs',
    is_flag=True,
    help='allow accessing semantic-md files in subdirs of SOURCE parent (cwd if SOURCE is -)',
)
@click.option(
    '-s',
    '--schema',
    help='specify semantic-md schema (ignore any given in front matter)',
    metavar='SCHEMA',
)
@click.option(
    '-x',
    '--http',
    is_flag=True,
    help='allow accessing semantic-md schema files over HTTP',
)
def json(source, destination, subdirs, schema, http):
    """Convert SOURCE.md to DESTINATION.json"""
    # The docstring above is the command's help text; keep it short.
    #
    # Flow: read SOURCE, split off the yaml front matter, locate the schema
    # (from -s or from the front matter's 'semantic-md' key), convert, and
    # only then open DESTINATION so a failed conversion never truncates it.
    with click.open_file(source, 'r', 'utf-8') as src:
        front, body = convert.md_parse_front_matter(src.read())

    if not schema:
        try:
            part = front['semantic-md']
        except (KeyError, TypeError):
            raise click.UsageError(
                'No semantic-md schema given in front matter, use -s SCHEMA'
            ) from None
        partpath, partname = os.path.split(part)

        if httpish(part):
            if not http:
                raise click.UsageError('Use -x to allow HTTP schema downloads')
            # ClickException (unlike click.Abort) actually displays its message
            raise click.ClickException('FIXME: not yet implemented')

        if not subdirs:
            raise click.UsageError('Use -d to allow referencing schemas in subdirs')

        if source == '-':
            parent = os.getcwd()
        else:
            parent = os.path.abspath(os.path.split(source)[0])

        if not os.path.isabs(partpath):
            partpath = os.path.join(parent, partpath)

        # Compare whole path components: a plain string prefix test would
        # accept e.g. /data/docs-private as being inside /data/docs.
        try:
            inside = os.path.commonpath([parent, os.path.abspath(partpath)]) == parent
        except ValueError:
            # commonpath refuses paths it cannot compare (e.g. different drives)
            inside = False
        if not inside:
            raise click.UsageError(f'Schema {partpath} is not in a subdir of {parent}')

        schema = os.path.join(partpath, partname)

    with click.open_file(schema, 'r', 'utf-8') as schema_file:
        s = convert.Schema.read(schema_file.read())

    parsed = convert.md_parse_body(body, s)
    out = convert.to_json(parsed, s)

    with click.open_file(destination, 'w', 'utf-8') as dst:
        dump(out, dst, indent=2)
@@ -0,0 +1,102 @@
1
+ import re
2
+ from yaml import safe_load
3
+
4
+ from mistletoe import Document
5
+ from mistletoe.block_token import Heading
6
+ from mistletoe.markdown_renderer import MarkdownRenderer, BlankLine
7
+
8
+ from .match import HeadingMatch, apply_path_vars, parse_matches
9
+ from .text import NoMatch
10
+
11
+
12
class Schema:
    """
    Top-level semantic-md schema.

    Instances carry two lists of parsed matches: ``children`` and
    ``sections`` (both produced by parse_matches).
    """

    @classmethod
    def read(cls, yaml_str):
        """
        Build a Schema from the text of a yaml schema file.

        The document must be a mapping containing ``semantic-md-version`` and
        ``sections``; ``children`` is optional. Any other key is rejected.

        Raises ValueError when the document does not have that shape.
        """
        y = safe_load(yaml_str)

        # Explicit checks instead of assert: asserts vanish under python -O
        if not isinstance(y, dict):
            raise ValueError('semantic-md schema must be a yaml mapping')
        required = {'sections', 'semantic-md-version'}
        if missing := required - set(y):
            raise ValueError(
                'semantic-md schema is missing: ' + ', '.join(sorted(missing))
            )
        if unknown := set(y) - required - {'children'}:
            raise ValueError(
                'semantic-md schema has unknown keys: '
                + ', '.join(sorted(map(str, unknown)))
            )

        schema = cls()
        # "or []" also covers keys present with an empty (None) value
        schema.children = parse_matches(y.get('children') or [])
        schema.sections = parse_matches(y.get('sections') or [])

        return schema
23
+
24
+
25
class MatchFrame:
    """
    One level of the schema stack used while walking a document.

    Iterating a frame yields the candidate matches to try at the current
    position: first the not-yet-consumed ``children`` of the wrapped schema,
    then all of its ``sections``. The children iterator is shared between
    iterations, so every child is offered at most once over the lifetime of
    the frame, while sections are offered on every iteration.
    """

    def __init__(self, schema: 'Schema | HeadingMatch'):
        self.schema = schema
        # sections repeat, children only match once
        # (children may be None on a HeadingMatch that declares none)
        self.children = iter(schema.children or ())

    def __iter__(self):
        yield from self.children
        if self.schema.sections:
            yield from self.schema.sections
35
+
36
+
37
def to_json(doc: Document, schema: Schema):
    """
    Convert a parsed markdown document to a JSON-style dict using schema.

    Walks doc.children left to right. At each position the candidates of the
    innermost MatchFrame are tried in order; the first one that matches
    consumes ``result.tokens`` tokens and patches the output under the
    current path prefix. A matching HeadingMatch additionally opens a nested
    frame and pushes its patch_path (with vars applied) onto the prefix.

    Raises NoMatch, carrying the line number and re-rendered markdown of the
    offending token, when no candidate matches at some position.
    """
    doc_pos = 0
    # level of the innermost open heading (0 = document top level)
    heading_level = 0
    # schema_stack and prefix_stack are pushed/popped together, one entry per
    # open heading plus the root entry
    schema_stack = [MatchFrame(schema)]
    prefix_stack = ['/']
    json_doc = {}

    while doc_pos < len(doc.children):
        tok = doc.children[doc_pos]
        if isinstance(tok, BlankLine):
            doc_pos += 1
            continue

        if isinstance(tok, Heading):
            # a heading at the same or a shallower level closes open frames
            while tok.level <= heading_level:
                schema_stack.pop()
                prefix_stack.pop()
                heading_level -= 1

        for match in schema_stack[-1]:
            if result := match.match_md(doc.children, doc_pos):
                doc_pos += result.tokens
                json_doc = match.patch(result, json_doc, ''.join(prefix_stack))

                if isinstance(match, HeadingMatch):
                    # note: this is the heading from the schema pattern
                    heading = match.doc[0]
                    # FIXME: enforce these
                    assert isinstance(heading, Heading)
                    assert heading_level + 1 == heading.level
                    heading_level = heading.level
                    schema_stack.append(MatchFrame(match))
                    prefix_stack.append(
                        apply_path_vars(match.patch_path, result.vars_map) + '/'
                        if match.patch_path
                        else ''
                    )
                break
        else:
            # for/else: reached only when no candidate matched
            with MarkdownRenderer() as renderer:
                raise NoMatch(
                    f'line {doc.children[doc_pos].line_number}\n'
                    + renderer.render(doc.children[doc_pos])
                )

    return json_doc
82
+
83
+
84
class InputError(Exception):
    """Raised when the input markdown lacks the expected yaml front matter."""
86
+
87
+
88
def md_parse_front_matter(s):
    """
    Split markdown text into its yaml front matter and body.

    The text must begin (ignoring leading whitespace) with a ``---`` line,
    followed by yaml, followed by a second ``---`` line.

    Returns (front, body): front is the parsed yaml mapping, body is the
    text after the closing ``---`` line.

    Raises InputError when the front matter is missing or is not a mapping.
    """
    # maxsplit is passed by keyword: passing it positionally is deprecated
    parts = re.split(r'^---\s*$', s, maxsplit=2, flags=re.MULTILINE)
    if len(parts) != 3 or parts[0].strip() or '\n' not in parts[1]:
        raise InputError('expected yaml front matter not found')

    front = safe_load(parts[1])
    # callers index front by key, so reject empty/scalar/list front matter here
    if not isinstance(front, dict):
        raise InputError('yaml front matter must be a mapping')
    body = parts[2]

    return front, body
97
+
98
+
99
def md_parse_body(body, schema):
    """
    Parse markdown body text into a mistletoe Document.

    schema is accepted but not used by the parser.
    """
    # context manager required for creation of BlankLine, etc.
    renderer = MarkdownRenderer()
    with renderer:
        document = Document(body)
    return document
@@ -0,0 +1,366 @@
1
+ import re
2
+ import json
3
+ from dataclasses import dataclass
4
+ from typing import Any
5
+
6
+ from jsonpatch import JsonPatch
7
+ from jsonpointer import JsonPointer
8
+ from mistletoe import Document
9
+ from mistletoe.block_token import (
10
+ BlockToken,
11
+ Table,
12
+ TableCell,
13
+ Paragraph,
14
+ Quote,
15
+ BlockCode,
16
+ CodeFence,
17
+ List,
18
+ ListItem,
19
+ HtmlBlock,
20
+ )
21
+ from mistletoe.markdown_renderer import MarkdownRenderer, BlankLine
22
+ from mistletoe.span_token import RawText
23
+ from mistletoe.token import Token
24
+
25
+ from .text import match_content, NoMatch
26
+
27
+ MISTUNE_PLUGINS = ['table', 'def_list']
28
+
29
+ MD_FILTER_VAR = re.compile(r'^\s*{\s*(\w+[\d\w]*)\s*\|\s*md\s*}\s*$')
30
+ MD_FILTER_TYPES = (
31
+ Paragraph,
32
+ Quote,
33
+ BlockCode,
34
+ CodeFence,
35
+ List,
36
+ HtmlBlock,
37
+ BlankLine,
38
+ )
39
+
40
+ LIST_FILTER_VAR = re.compile(r'^\s*{\s*(\w+[\d\w]*)\s*\|\s*list\s*}\s*$')
41
+
42
+
43
# Outcome of a successful match against document tokens.
@dataclass
class MatchResult:
    # number of document tokens consumed; callers advance their position by this
    tokens: int
    # variable name -> captured value (list filters store a list of strings)
    vars_map: dict[str, str]
47
+
48
+
49
# MatchResult for a table match; also carries the table's body rows.
@dataclass
class TableMatchResult(MatchResult):
    # the matched Table's children (its rows), consumed row by row when patching
    table_data: list[Token]
52
+
53
+
54
def is_md_filter_var(match_token):
    """
    {var|md}

    Return the variable name when match_token is a Paragraph whose single
    child is text of the form ``{var | md}``, otherwise None.
    """
    if not isinstance(match_token, Paragraph) or len(match_token.children) != 1:
        return None
    # the single child may be a span without text content (e.g. emphasis)
    content = getattr(match_token.children[0], 'content', None)
    if not isinstance(content, str):
        return None
    md_var = MD_FILTER_VAR.match(content)
    return md_var.group(1) if md_var else None
62
+
63
+
64
def is_list_filter_var(match_token):
    """
    - {var|list}

    Return the variable name when match_token is a List with exactly one
    ListItem holding exactly one Paragraph whose single child is text of the
    form ``{var | list}``, otherwise None.
    """
    if not isinstance(match_token, List) or len(match_token.children) != 1:
        return None
    li = match_token.children[0]
    if not isinstance(li, ListItem) or len(li.children) != 1:
        return None
    p = li.children[0]
    if not isinstance(p, Paragraph) or len(p.children) != 1:
        return None
    # the single child may be a span without text content (e.g. emphasis)
    content = getattr(p.children[0], 'content', None)
    if not isinstance(content, str):
        return None
    list_var = LIST_FILTER_VAR.match(content)
    return list_var.group(1) if list_var else None
76
+
77
+
78
def match_content_tree(
    vars_map: dict[str, str | None],
    match_token: Token,
    tokens: list[Token],
    token_pos: int,
) -> int:
    """
    returns number of tokens matched by match_token starting
    from tokens[token_pos]. On match updates vars_map in-place.

    0 means "no match". Three kinds of pattern token are handled:

    - ``{var|md}`` paragraph: matches the longest run of MD_FILTER_TYPES
      tokens; the var receives their rendered markdown up to the last
      non-blank token, and the returned count includes trailing blank lines.
    - ``- {var|list}`` list: matches one List; the var receives the list of
      item texts.
    - anything else: the document token must be an instance of the pattern
      token's type (and have the same heading level, if any), the children
      must match pairwise and completely, and RawText content is matched
      with match_content.
    """
    if token_pos >= len(tokens):
        return 0

    if md_var := is_md_filter_var(match_token):
        end = token_pos
        last_nonblank = None
        while end < len(tokens) and isinstance(tokens[end], MD_FILTER_TYPES):
            if not isinstance(tokens[end], BlankLine):
                last_nonblank = end
            end += 1
        # nothing but blank lines (or nothing at all) is not a match, and
        # vars_map must stay untouched in that case
        if last_nonblank is None:
            return 0
        with MarkdownRenderer() as renderer:
            md = ''.join(
                renderer.render(tokens[j]) for j in range(token_pos, last_nonblank + 1)
            )
        if vars_map[md_var] is None:
            vars_map[md_var] = md
        elif vars_map[md_var] != md:
            return 0
        # end is one past the last consumed token, also at end of input
        return end - token_pos

    token = tokens[token_pos]

    if list_var := is_list_filter_var(match_token):
        if not isinstance(token, List):
            return 0
        # FIXME: handle non-single-paragraph lists
        list_vals = [c.children[0].children[0].content for c in token.children]
        if vars_map[list_var] is None:
            vars_map[list_var] = list_vals
        elif vars_map[list_var] != list_vals:
            return 0
        return 1

    if not isinstance(token, type(match_token)):
        return 0

    # headings of different depth are different structure: "# X" != "## X"
    level = getattr(match_token, 'level', None)
    if level is not None and getattr(token, 'level', None) != level:
        return 0

    if mtoks := match_token.children:
        toks = token.children or ()
        pos = 0
        for mt in mtoks:
            # >= : the pattern still has tokens but the document ran out
            if pos >= len(toks):
                return 0
            if not (matched := match_content_tree(vars_map, mt, toks, pos)):
                return 0
            pos += matched
        if pos < len(toks):
            return 0

    if isinstance(match_token, RawText) and hasattr(match_token, 'content'):
        try:
            new_vars = match_content(vars_map, match_token.content, token.content)
        except NoMatch:
            return 0
        vars_map.update(new_vars)
    return 1
140
+
141
+
142
def match_block_tokens(
    match_tokens: list[BlockToken],
    tokens: list[BlockToken],
    token_pos: int,
    vars_: list[str],
) -> MatchResult | None:
    """
    Match a sequence of pattern block tokens against tokens[token_pos:].

    Every pattern token must match in order; blank lines in the document
    following each matched token are skipped and counted as consumed.
    Returns a MatchResult with the consumed count and the collected vars,
    or None as soon as one pattern token fails to match.
    """
    vars_map = dict.fromkeys(vars_ or [])
    total = len(tokens)
    consumed = 0

    for match_token in match_tokens:
        cursor = token_pos + consumed
        if cursor >= total:
            return None

        matched = match_content_tree(vars_map, match_token, tokens, cursor)
        if not matched:
            return None
        consumed += matched

        # swallow blank lines trailing the token(s) just matched
        while token_pos + consumed < total:
            if not isinstance(tokens[token_pos + consumed], BlankLine):
                break
            consumed += 1

    return MatchResult(consumed, vars_map)
167
+
168
+
169
def match_table_columns(
    cols: list[str],
    row_submatch: dict[str, Any] | None,
    tokens: list[BlockToken],
    pos: int,
) -> MatchResult | None:
    """
    Match tokens[pos] as a Table whose header cells match cols.

    The table must have exactly len(cols) header cells and each header cell
    must match the corresponding entry of cols. Returns a TableMatchResult
    consuming one token and carrying the table's rows, or None when
    tokens[pos] is not such a table.
    """
    vars_map = {}

    if pos >= len(tokens):
        return None

    table = tokens[pos]
    if not isinstance(table, Table):
        return None

    header = getattr(table, 'header', None)
    if header is None or len(header.children) != len(cols):
        return None

    for i, txt in enumerate(cols):
        # match_content_tree signals a mismatch by returning 0 (it never
        # raises NoMatch), so the result has to be checked
        if not match_content_tree(vars_map, TableCell(content=txt), header.children, i):
            return None

    if row_submatch:
        # FIXME: check that submatch matches
        pass

    return TableMatchResult(1, vars_map, table.children)
195
+
196
+
197
def apply_path_vars(path: str, vars_map: dict[str, str]):
    """
    Substitute ``$var`` references in path with values from vars_map.

    A reference only counts when followed by a word boundary, so ``$id``
    does not touch ``$id2``. Values are inserted literally.
    """
    for name in vars_map:
        replacement = vars_map[name]
        # FIXME: jsonpath escaping for val?
        reference = re.compile(r'\$' + name + r'\b')
        path = reference.sub(lambda _m, _value=replacement: _value, path)
    return path
202
+
203
+
204
def apply_json_patch(
    patch_add: dict[str, Any],
    vars_map: dict[str, str],
    json_doc: dict[str, Any],
    prefix: str,
) -> dict[str, Any]:
    """
    Apply the "add" entries of patch_add to json_doc below prefix.

    patch_add maps relative paths to JSON values. ``$var`` references in a
    path are substituted as text; in a value, a JSON string that is exactly
    ``"$var"`` is replaced by the JSON encoding of the var's value. Missing
    intermediate containers are created before the patch is applied.

    Returns the result of JsonPatch(...).apply(json_doc).
    """
    filled_patch = {}
    for path, json_value in patch_add.items():
        # substitute on the serialized form so nested values are covered too
        jv = json.dumps(json_value)
        path = apply_path_vars(path, vars_map)
        for var, val in vars_map.items():
            jv = re.sub(r'(?<!\\)"\$' + var + '"', lambda m: json.dumps(val), jv)
        filled_patch[path] = json.loads(jv)

    # create missing objects in paths
    prefix_pos = json_doc
    # [:-1] drops the last pointer part (empty for a prefix ending in '/')
    for step in JsonPointer(prefix).get_parts()[:-1]:
        prefix_pos = prefix_pos.setdefault(step, {})
    for path, json_value in filled_patch.items():
        if path:
            path_pos = prefix_pos
            steps = JsonPointer('/' + path).get_parts()
            for i, step in enumerate(steps):
                # a path ending in '-' appends, so its parent must be a list
                if steps[-1:] == ['-'] and i == len(steps) - 2:
                    path_pos = path_pos.setdefault(step, [])
                    break
                path_pos = path_pos.setdefault(step, {})

    operations = [
        {'op': 'add', 'path': prefix + path, 'value': json_value}
        for path, json_value in filled_patch.items()
    ]
    return JsonPatch(operations).apply(json_doc)
237
+
238
+
239
def apply_table_json_patch(
    row_patch_add: dict[str, Any],
    row_submatch: dict[str, Any] | None,
    result: TableMatchResult,
    json_doc: dict[str, Any],
    prefix: str,
) -> dict[str, Any]:
    """
    Patch json_doc once per table row in result.table_data.

    For each row the cell texts are exposed as vars ``1``, ``2``, ... (so
    ``$1`` is the first column) and row_patch_add is applied. row_submatch
    maps a source expression (e.g. ``$2``) to a list of rules that are then
    run, in order, against the substituted source text:

    - ``filter_match``: on match apply the rule's patch_add; a non-empty
      captured ``{content}`` replaces the source text for later rules.
    - ``match``: on match apply the rule's patch_add with the row vars plus
      the vars captured by the match.

    Rules that do not match are skipped. Returns the patched document.
    """
    # FIXME: assuming cells are always a single RawText
    for row in result.table_data:
        vars_map = {
            f'{i + 1}': cell.children[0].content for i, cell in enumerate(row.children)
        }
        json_doc = apply_json_patch(row_patch_add, vars_map, json_doc, prefix)

        for rm, rules in (row_submatch or {}).items():
            src = apply_path_vars(rm, vars_map)
            for rule in rules:
                if flt := rule.get('filter_match'):
                    try:
                        content_vars = match_content({'content': None}, flt, src)
                    except NoMatch:
                        continue
                    # the remainder captured by {content} feeds the next rules
                    if cmatch := content_vars['content']:
                        src = cmatch
                    json_doc = apply_json_patch(
                        rule['patch_add'], vars_map, json_doc, prefix
                    )

                elif mat := rule.get('match'):
                    mvars = {var: None for var in rule['vars'] or []}
                    try:
                        mvars = match_content(mvars, mat, src)
                    except NoMatch:
                        continue
                    json_doc = apply_json_patch(
                        rule['patch_add'], {**vars_map, **mvars}, json_doc, prefix
                    )
    return json_doc
277
+
278
+
279
class MatchBase:
    """
    Shared behaviour of schema matches.

    The methods read ``self.doc``, ``self.vars_`` and ``self.patch_add``,
    which this class does not define itself.
    """

    def match_md(
        self,
        tokens: list[BlockToken],
        pos: int,
    ) -> MatchResult | None:
        """Match self.doc against tokens starting at pos; None when it fails."""
        return match_block_tokens(self.doc, tokens, pos, self.vars_)

    def patch(
        self, result: MatchResult, json_doc: dict[str, Any], prefix: str
    ) -> dict[str, Any]:
        """Apply self.patch_add (if any) with result's vars; return the document."""
        if self.patch_add:
            return apply_json_patch(self.patch_add, result.vars_map, json_doc, prefix)
        return json_doc
293
+
294
+
295
# Plain content match: a markdown pattern plus the patch applied when it matches.
@dataclass
class Match(MatchBase):
    # parsed block tokens of the pattern markdown
    doc: list[BlockToken]
    # relative path -> JSON value to add on match
    patch_add: dict[str, Any]
    # names of the variables the pattern may capture
    vars_: list[str]
300
+
301
+
302
# Table match: matches a table by its header columns and patches once per row.
@dataclass
class TableMatch(MatchBase):
    # expected header cell texts, in column order
    cols: list[str]
    # relative path -> JSON value added for every row
    row_patch_add: dict[str, Any]
    # optional per-row rules keyed by source expression (e.g. "$2")
    row_submatch: dict[str, Any] | None = None

    @property
    def vars_(self):
        # one positional name per column: $1, $2, ...
        return [f'${n + 1}' for n in range(len(self.cols))]

    def match_md(
        self,
        tokens: list[BlockToken],
        pos: int,
    ) -> TableMatchResult | None:
        """Match tokens[pos] as a table with these columns; None when it fails."""
        return match_table_columns(self.cols, self.row_submatch, tokens, pos)

    def patch(
        self, result: TableMatchResult, json_doc: dict[str, Any], prefix: str
    ) -> dict[str, Any]:
        """Apply the row patch and row submatch rules for every matched row."""
        return apply_table_json_patch(
            self.row_patch_add,
            self.row_submatch,
            result,
            json_doc,
            prefix,
        )
329
+
330
+
331
# Heading match: matches a heading and carries the nested matches for the
# content beneath it.
@dataclass
class HeadingMatch(MatchBase):
    # parsed block tokens of the heading pattern
    doc: list[BlockToken]
    # path (may contain $vars) pushed as prefix for everything under the heading
    patch_path: str | None
    # relative path -> JSON value to add when the heading matches
    patch_add: dict[str, Any] | None
    # names of the variables the pattern may capture
    vars_: list[str] | None
    # nested matches; None when the schema entry declares none
    children: list[MatchBase] | None = None
    sections: list[MatchBase] | None = None
339
+
340
+
341
class UnknownMatch(Exception):
    """Raised for a schema entry that is no heading, content or table match."""
343
+
344
+
345
def parse_match(m):
    """
    Turn one schema entry (a mapping) into a match object.

    The entry kind is chosen by the first non-empty key among
    ``heading_match``, ``match`` and ``table_match``; anything else raises
    UnknownMatch with the entry as argument.
    """
    heading_md = m.get('heading_match')
    if heading_md:
        heading = HeadingMatch(
            Document(heading_md).children,
            m.get('patch_path'),
            m.get('patch_add'),
            m.get('vars'),
        )
        # nested lists stay None unless the entry provides non-empty ones
        nested_children = m.get('children')
        if nested_children:
            heading.children = parse_matches(nested_children)
        nested_sections = m.get('sections')
        if nested_sections:
            heading.sections = parse_matches(nested_sections)
        return heading

    content_md = m.get('match')
    if content_md:
        return Match(Document(content_md).children, m.get('patch_add'), m.get('vars'))

    cols = m.get('table_match')
    if cols:
        return TableMatch(cols, m['row_patch_add'], m.get('row_submatch'))

    raise UnknownMatch(m)
363
+
364
+
365
def parse_matches(matches):
    """Parse every schema entry in matches, preserving order, into a list."""
    parsed = []
    for entry in matches:
        parsed.append(parse_match(entry))
    return parsed
@@ -0,0 +1,38 @@
1
+ import re
2
+
3
+
4
class NoMatch(Exception):
    """Raised when a pattern does not match the given content."""
6
+
7
+
8
+ def match_content(
9
+ vars_map: dict[str, str | None],
10
+ pattern: str,
11
+ content: str,
12
+ ) -> dict[str, str]:
13
+ """
14
+ Collect/apply vars_map to {key} values in pattern against
15
+ content.
16
+
17
+ Return newly collected values when pattern matches content
18
+ (empty dict is success with no values) otherwise raise NoMatch.
19
+ """
20
+ ep = r'^\s*' + re.escape(pattern) + r'\s*$'
21
+
22
+ # var names are restricted to [a-z][0-9]_
23
+ for var, val in vars_map.items():
24
+ if val is not None:
25
+ ep = ep.replace(r'\{' + var + r'\}', re.escape(val))
26
+ continue
27
+ ep = ep.replace(r'\{' + var + r'\}', '(?P<g' + var + '>.*)', 1)
28
+ ep = ep.replace(r'\{' + var + r'\}', '(?P=g' + var + ')')
29
+
30
+ m = re.match(ep, content)
31
+ if not m:
32
+ raise NoMatch()
33
+
34
+ return {
35
+ var: m.group('g' + var)
36
+ for var, val in vars_map.items()
37
+ if val is None and 'g' + var in m.groupdict()
38
+ }
@@ -0,0 +1,107 @@
1
+ # Proposed SemanticMD-based data dictionary format
2
+ #
3
+ # Describes dataset metadata, resource metadata, tables and columns
4
+ # See https://semanticmd.org for more information
5
+
6
+ semantic-md-version: 0
7
+
8
+ sections:
9
+ - heading_match: |
10
+ # Dataset {id}
11
+ vars: [id]
12
+ patch_add:
13
+ datasets/$id: {}
14
+ patch_path: datasets/$id
15
+
16
+ children:
17
+ - match: |
18
+ {desc | md}
19
+ vars: [desc]
20
+ patch_add:
21
+ description: $desc
22
+ - table_match: [Resource, Table, Title]
23
+ row_patch_add:
24
+ resources/-: {"name": $1, "table": $2, "title": $3}
25
+
26
+ - heading_match: |
27
+ # Table {id}
28
+ vars: [id]
29
+ patch_add:
30
+ tables/$id: {}
31
+ patch_path: tables/$id
32
+
33
+ children:
34
+ - table_match: [Column, Type, Label]
35
+ row_patch_add:
36
+ columns/$1/label: $3
37
+ row_submatch:
38
+ $2:
39
+ - filter_match: "required {content}"
40
+ patch_add:
41
+ columns/$1/required: true
42
+ - match: "{typename}"
43
+ vars: [typename]
44
+ patch_add:
45
+ columns/$1/type: $typename
46
+
47
+ - table_match: [Primary key]
48
+ row_patch_add:
49
+ primary_key/-: $1
50
+
51
+ sections:
52
+ - heading_match: |
53
+ ## Column {id}
54
+ vars: [id]
55
+ patch_path: columns/$id
56
+
57
+ children:
58
+ - match: |
59
+ {desc | md}
60
+ vars: [desc]
61
+ patch_add:
62
+ description: $desc
63
+ - match: |
64
+ ### Validation
65
+
66
+ Values >= {min}
67
+ vars: [min]
68
+ patch_add:
69
+ validation: {"gte": $min}
70
+ - match: |
71
+ ### Choices
72
+
73
+ - {vals | list}
74
+ vars: [vals]
75
+ patch_add:
76
+ choices: $vals
77
+
78
+ - heading_match: |
79
+ # Resource {id}
80
+ vars: [id]
81
+ patch_path: resources/$id
82
+
83
+ children:
84
+ - match: |
85
+ {desc | md}
86
+ vars: [desc]
87
+ patch_add:
88
+ description: $desc
89
+
90
+ sections:
91
+ - heading_match: |
92
+ ## Statistics
93
+ children:
94
+ - table_match: [Column, Min, Max, Cardinality, Null Count]
95
+ row_patch_add:
96
+ columns/$1/statistics: {
97
+ "min": $2, "max": $3, "cardinality": $4, "null_count": $5,
98
+ }
99
+ - heading_match: |
100
+ ### Frequency for {col}
101
+ vars: [col]
102
+ patch_path: columns/$col
103
+
104
+ children:
105
+ - table_match: [Choice, Frequency]
106
+ row_patch_add:
107
+ frequency/$1: $2
@@ -0,0 +1,93 @@
1
+ {
2
+ "datasets": {
3
+ "311 NYC Data": {
4
+ "description": "311 NYC dataset example\n\nAnd some extra text for |md to match\n",
5
+ "resources": [
6
+ {
7
+ "name": "NYC_311_SR_2010-2020-sample-1M.csv",
8
+ "table": "311nyc",
9
+ "title": "NYC 311 Data 1M row sample"
10
+ }
11
+ ]
12
+ }
13
+ },
14
+ "tables": {
15
+ "311nyc": {
16
+ "columns": {
17
+ "Unique Key": {
18
+ "label": "Record Identifier",
19
+ "required": true,
20
+ "type": "integer",
21
+ "description": "A unique numeric identifier for each complaint record.\n\nIt is the primary key in the dataset and has 1,000,000 distinct values (100% uniqueness).\n",
22
+ "validation": {
23
+ "gte": "1"
24
+ }
25
+ },
26
+ "Created Date": {
27
+ "label": "Complaint Creation Timestamp",
28
+ "required": true,
29
+ "type": "timestamp",
30
+ "description": "UTC timestamp indicating when a 311 service request was logged.\n\nThe dates span from January\u202f1\u202f2010 to December\u202f23\u202f2020 with a mean around November\u202f10\u202f2015. Approximately 84% of records have missing values.\n"
31
+ },
32
+ "Status": {
33
+ "label": "Complaint Status",
34
+ "required": true,
35
+ "type": "text",
36
+ "description": "Current processing status\u2014Closed, Pending, Open, etc.\n\nClosed complaints dominate (~95\u202f%), with small percentages remaining pending or open.\n",
37
+ "choices": [
38
+ "Assigned",
39
+ "Closed",
40
+ "Closed - Testing",
41
+ "Email Sent",
42
+ "In Progress",
43
+ "Open",
44
+ "Pending",
45
+ "Started",
46
+ "Unassigned",
47
+ "Unspecified"
48
+ ]
49
+ }
50
+ },
51
+ "primary_key": [
52
+ "Unique Key"
53
+ ]
54
+ }
55
+ },
56
+ "resources": {
57
+ "NYC_311_SR_2010-2020-sample-1M.csv": {
58
+ "columns": {
59
+ "Unique Key": {
60
+ "statistics": {
61
+ "min": "11465364",
62
+ "max": "48478173",
63
+ "cardinality": "1000000",
64
+ "null_count": "0"
65
+ }
66
+ },
67
+ "Created Date": {
68
+ "statistics": {
69
+ "min": "2010-01-01T00:00:00+00:00",
70
+ "max": "2020-12-23T01:25:51+00:00",
71
+ "cardinality": "841014",
72
+ "null_count": "0"
73
+ }
74
+ },
75
+ "Status": {
76
+ "statistics": {
77
+ "min": "Assigned",
78
+ "max": "Unspecified",
79
+ "cardinality": "10",
80
+ "null_count": "0"
81
+ },
82
+ "frequency": {
83
+ "Assigned": "6651",
84
+ "In Progress": "7841",
85
+ "Open": "12340",
86
+ "Pending": "20119"
87
+ }
88
+ }
89
+ }
90
+ }
91
+ }
92
+ }
93
+
@@ -0,0 +1,79 @@
1
+ ---
2
+ semantic-md: datadict.yml
3
+ ---
4
+
5
+ # Dataset 311 NYC Data
6
+
7
+ 311 NYC dataset example
8
+
9
+ And some extra text for |md to match
10
+
11
+ | Resource | Table | Title |
12
+ | --- | --- | --- |
13
+ | NYC_311_SR_2010-2020-sample-1M.csv | 311nyc | NYC 311 Data 1M row sample |
14
+
15
+ # Table 311nyc
16
+
17
+ | Column | Type | Label |
18
+ | --- | --- | --- |
19
+ | Unique Key | required integer | Record Identifier |
20
+ | Created Date | required timestamp | Complaint Creation Timestamp |
21
+ | Status | required text | Complaint Status |
22
+
23
+ | Primary key |
24
+ | --- |
25
+ | Unique Key |
26
+
27
+ ## Column Unique Key
28
+
29
+ A unique numeric identifier for each complaint record.
30
+
31
+ It is the primary key in the dataset and has 1,000,000 distinct values (100% uniqueness).
32
+
33
+ ### Validation
34
+
35
+ Values >= 1
36
+
37
+ ## Column Created Date
38
+
39
+ UTC timestamp indicating when a 311 service request was logged.
40
+
41
+ The dates span from January 1 2010 to December 23 2020 with a mean around November 10 2015. Approximately 84% of records have missing values.
42
+
43
+ ## Column Status
44
+
45
+ Current processing status—Closed, Pending, Open, etc.
46
+
47
+ Closed complaints dominate (~95 %), with small percentages remaining pending or open.
48
+
49
+ ### Choices
50
+
51
+ - Assigned
52
+ - Closed
53
+ - Closed - Testing
54
+ - Email Sent
55
+ - In Progress
56
+ - Open
57
+ - Pending
58
+ - Started
59
+ - Unassigned
60
+ - Unspecified
61
+
62
+ # Resource NYC_311_SR_2010-2020-sample-1M.csv
63
+
64
+ ## Statistics
65
+
66
+ | Column | Min | Max | Cardinality | Null Count |
67
+ | --- | ---: | ---: | ---: | ---: |
68
+ | Unique Key | 11465364 | 48478173 | 1000000 | 0 |
69
+ | Created Date | 2010-01-01T00:00:00+00:00 | 2020-12-23T01:25:51+00:00 | 841014 | 0 |
70
+ | Status | Assigned | Unspecified | 10 | 0 |
71
+
72
+ ### Frequency for Status
73
+
74
+ | Choice | Frequency |
75
+ | :--- | ---: |
76
+ | Pending | 20119 |
77
+ | Open | 12340 |
78
+ | In Progress | 7841 |
79
+ | Assigned | 6651 |
@@ -0,0 +1,24 @@
1
+ import os
2
+ import json
3
+
4
+ from yaml import safe_load
5
+
6
+ from semantic_md import convert
7
+
8
+ HERE = os.path.split(__file__)[0]
9
+
10
+
11
def test1():
    """example.md converted with datadict.yml must equal example.json."""
    # Fixtures contain non-ASCII text, so read them as UTF-8 regardless of the
    # platform default, and close every file deterministically.
    with open(os.path.join(HERE, 'example.md'), encoding='utf-8') as f:
        front, body = convert.md_parse_front_matter(f.read())
    assert front['semantic-md'] == 'datadict.yml'

    with open(os.path.join(HERE, 'datadict.yml'), encoding='utf-8') as f:
        schema_text = f.read()

    parsed = convert.md_parse_body(body, safe_load(schema_text))

    schema = convert.Schema.read(schema_text)

    out = convert.to_json(parsed, schema)
    with open(os.path.join(HERE, 'example.json'), encoding='utf-8') as f:
        assert out == json.load(f)