PyPI - semantic-md - Versions diffs - 0.0.1__tar.gz - Mend

semantic-md 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

semantic_md-0.0.1/.gitignore +3 -0
semantic_md-0.0.1/Makefile +13 -0
semantic_md-0.0.1/PKG-INFO +14 -0
semantic_md-0.0.1/Pipfile +15 -0
semantic_md-0.0.1/Pipfile.lock +136 -0
semantic_md-0.0.1/README.md +1 -0
semantic_md-0.0.1/pyproject.toml +34 -0
semantic_md-0.0.1/semantic_md/__init__.py +0 -0
semantic_md-0.0.1/semantic_md/cli.py +81 -0
semantic_md-0.0.1/semantic_md/convert.py +102 -0
semantic_md-0.0.1/semantic_md/match.py +366 -0
semantic_md-0.0.1/semantic_md/text.py +38 -0
semantic_md-0.0.1/tests/datadict.yml +107 -0
semantic_md-0.0.1/tests/example.json +93 -0
semantic_md-0.0.1/tests/example.md +79 -0
semantic_md-0.0.1/tests/test_datadict.py +24 -0

semantic_md-0.0.1/.gitignore ADDED Viewed

@@ -0,0 +1,3 @@
+__pycache__
+.*.swp
+dist/

semantic_md-0.0.1/Makefile ADDED Viewed

@@ -0,0 +1,13 @@
+# Every target in this file is a command alias, not a file to be built.
+.PHONY: test-interactive check check-fix format
+# Run the tests with output shown (-s) and a debugger opened on failures (--pdb).
+# PYTHONBREAKPOINT routes breakpoint() calls to ipdb. It must be set on the same
+# recipe line as pytest: make runs each recipe line in its own shell, so a separate
+# `export` line would never reach the pytest process.
+test-interactive:
+	PYTHONBREAKPOINT=ipdb.set_trace pytest tests/ -s --pdb -vv
+# Lint with ruff.
+check:
+	ruff check
+# Lint with ruff and apply its automatic fixes.
+check-fix:
+	ruff check --fix
+# Reformat the code with ruff.
+format:
+	ruff format

semantic_md-0.0.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,14 @@
+Metadata-Version: 2.4
+Name: semantic-md
+Version: 0.0.1
+Summary: Semantic Markdown tools
+Project-URL: Homepage, https://github.com/semantic-md/semantic-md
+Project-URL: Issues, https://github.com/semantic-md/semantic-md/issues
+Author-email: Ian Ward <ian@excess.org>
+License-Expression: MIT
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+FIXME: write readme

semantic_md-0.0.1/Pipfile ADDED Viewed

@@ -0,0 +1,15 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+[packages]
+pyyaml = "*"
+mistletoe = "*"
+jsonpatch = "*"
+click = "*"
+[dev-packages]
+[requires]
+python_version = "3.12"

semantic_md-0.0.1/Pipfile.lock ADDED Viewed

@@ -0,0 +1,136 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "23b44a21a83c1a118df662508c539475644363d276e6c87a0e06517c1134a76b"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3.12"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "click": {
+            "hashes": [
+                "sha256:40c50b7c6c6adac2823d411041ec84f3f103f1b280d5e9ce0d7f998995832f81",
+                "sha256:638f1338fe1235c8f4e008e4a8a254fb5c5fbdcbb40ece3c9142ebb78e792973"
+            ],
+            "index": "pypi",
+            "markers": "python_version >= '3.10'",
+            "version": "==8.4.0"
+        },
+        "jsonpatch": {
+            "hashes": [
+                "sha256:0ae28c0cd062bbd8b8ecc26d7d164fbbea9652a1a3693f3b956c1eae5145dade",
+                "sha256:9fcd4009c41e6d12348b4a0ff2563ba56a2923a7dfee731d004e212e1ee5030c"
+            ],
+            "index": "pypi",
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'",
+            "version": "==1.33"
+        },
+        "jsonpointer": {
+            "hashes": [
+                "sha256:0b801c7db33a904024f6004d526dcc53bbb8a4a0f4e32bfd10beadf60adf1900",
+                "sha256:8ff8b95779d071ba472cf5bc913028df06031797532f08a7d5b602d8b2a488ca"
+            ],
+            "markers": "python_version >= '3.10'",
+            "version": "==3.1.1"
+        },
+        "mistletoe": {
+            "hashes": [
+                "sha256:c5571ce6ca9cfdc7ce9151c3ae79acb418e067812000907616427197648030a3",
+                "sha256:d3e97664798261503f685f6a6281b092628367cf3128fc68a015a993b0c4feb3"
+            ],
+            "index": "pypi",
+            "markers": "python_version ~= '3.5'",
+            "version": "==1.5.1"
+        },
+        "pyyaml": {
+            "hashes": [
+                "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c",
+                "sha256:0150219816b6a1fa26fb4699fb7daa9caf09eb1999f3b70fb6e786805e80375a",
+                "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3",
+                "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956",
+                "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6",
+                "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c",
+                "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65",
+                "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a",
+                "sha256:1ebe39cb5fc479422b83de611d14e2c0d3bb2a18bbcb01f229ab3cfbd8fee7a0",
+                "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b",
+                "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1",
+                "sha256:22ba7cfcad58ef3ecddc7ed1db3409af68d023b7f940da23c6c2a1890976eda6",
+                "sha256:27c0abcb4a5dac13684a37f76e701e054692a9b2d3064b70f5e4eb54810553d7",
+                "sha256:28c8d926f98f432f88adc23edf2e6d4921ac26fb084b028c733d01868d19007e",
+                "sha256:2e71d11abed7344e42a8849600193d15b6def118602c4c176f748e4583246007",
+                "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310",
+                "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4",
+                "sha256:3c5677e12444c15717b902a5798264fa7909e41153cdf9ef7ad571b704a63dd9",
+                "sha256:3ff07ec89bae51176c0549bc4c63aa6202991da2d9a6129d7aef7f1407d3f295",
+                "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea",
+                "sha256:418cf3f2111bc80e0933b2cd8cd04f286338bb88bdc7bc8e6dd775ebde60b5e0",
+                "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e",
+                "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac",
+                "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9",
+                "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7",
+                "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35",
+                "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb",
+                "sha256:5cf4e27da7e3fbed4d6c3d8e797387aaad68102272f8f9752883bc32d61cb87b",
+                "sha256:5e0b74767e5f8c593e8c9b5912019159ed0533c70051e9cce3e8b6aa699fcd69",
+                "sha256:5ed875a24292240029e4483f9d4a4b8a1ae08843b9c54f43fcc11e404532a8a5",
+                "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b",
+                "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c",
+                "sha256:6344df0d5755a2c9a276d4473ae6b90647e216ab4757f8426893b5dd2ac3f369",
+                "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd",
+                "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824",
+                "sha256:66291b10affd76d76f54fad28e22e51719ef9ba22b29e1d7d03d6777a9174198",
+                "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065",
+                "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c",
+                "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c",
+                "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764",
+                "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196",
+                "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b",
+                "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00",
+                "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac",
+                "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8",
+                "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e",
+                "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28",
+                "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3",
+                "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5",
+                "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4",
+                "sha256:9c7708761fccb9397fe64bbc0395abcae8c4bf7b0eac081e12b809bf47700d0b",
+                "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf",
+                "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5",
+                "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702",
+                "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8",
+                "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788",
+                "sha256:b865addae83924361678b652338317d1bd7e79b1f4596f96b96c77a5a34b34da",
+                "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d",
+                "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc",
+                "sha256:bdb2c67c6c1390b63c6ff89f210c8fd09d9a1217a465701eac7316313c915e4c",
+                "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba",
+                "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f",
+                "sha256:c3355370a2c156cffb25e876646f149d5d68f5e0a3ce86a5084dd0b64a994917",
+                "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5",
+                "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26",
+                "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f",
+                "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b",
+                "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be",
+                "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c",
+                "sha256:efd7b85f94a6f21e4932043973a7ba2613b059c4a000551892ac9f1d11f5baf3",
+                "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6",
+                "sha256:fa160448684b4e94d80416c0fa4aac48967a969efe22931448d853ada8baf926",
+                "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0"
+            ],
+            "index": "pypi",
+            "markers": "python_version >= '3.8'",
+            "version": "==6.0.3"
+        }
+    },
+    "develop": {}
+}

semantic_md-0.0.1/README.md ADDED Viewed

@@ -0,0 +1 @@
+FIXME: write readme

semantic_md-0.0.1/pyproject.toml ADDED Viewed

@@ -0,0 +1,34 @@
+[build-system]
+requires = ["hatchling >= 1.26"]
+build-backend = "hatchling.build"
+[project]
+name = "semantic-md"
+version = "0.0.1"
+authors = [
+  { name="Ian Ward", email="ian@excess.org" },
+]
+description = "Semantic Markdown tools"
+readme = "README.md"
+requires-python = ">=3.10"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "Operating System :: OS Independent",
+]
+license = "MIT"
+license-files = ["LICEN[CS]E*"]
+[project.scripts]
+smd = "semantic_md.cli:cli"
+[project.urls]
+Homepage = "https://github.com/semantic-md/semantic-md"
+Issues = "https://github.com/semantic-md/semantic-md/issues"
+[tool.ruff]
+line-length = 90
+[tool.ruff.format]
+quote-style = "single"
+docstring-code-format = true
+line-ending = "lf"

semantic_md-0.0.1/semantic_md/__init__.py ADDED Viewed

File without changes

semantic_md-0.0.1/semantic_md/cli.py ADDED Viewed

@@ -0,0 +1,81 @@
+from urllib.parse import urlparse
+import os.path
+from json import dump
+import click
+from semantic_md import convert
+def httpish(s):
+    """Return True if ``s`` looks like an HTTP(S) URL, otherwise False.
+    Only the exact schemes ``http`` and ``https`` count (urlparse lower-cases the
+    scheme), and a network location must be present. Strings that urlparse rejects
+    with ValueError (for example a malformed IPv6 host) are reported as False.
+    """
+    try:
+        r = urlparse(s)
+    except ValueError:
+        return False
+    # bool() so callers always get True/False rather than the netloc string
+    return r.scheme in ('http', 'https') and bool(r.netloc)
+# Root click command group; subcommands attach to it via @cli.command().
+# pyproject.toml installs this callable as the `smd` console script.
+@click.group()
+def cli():
+    # Nothing to do at group level; the subcommands do the work.
+    pass
+@cli.command()
+@click.argument('source', default='-')
+@click.argument('destination', default='-')
+@click.option(
+    '-d',
+    '--subdirs',
+    is_flag=True,
+    help='allow accessing semantic-md files in subdirs of SOURCE parent (cwd if SOURCE is -)',
+)
+@click.option(
+    '-s',
+    '--schema',
+    help='specify semantic-md schema (ignore any given in front matter)',
+    metavar='SCHEMA',
+)
+@click.option(
+    '-x',
+    '--http',
+    is_flag=True,
+    help='allow accessing semantic-md schema files over HTTP',
+)
+def json(source, destination, subdirs, schema, http):
+    """Convert SOURCE.md to DESTINATION.json"""
+    # The docstring above doubles as the command's help text, so details live here.
+    # SOURCE is markdown with YAML front matter, DESTINATION receives the JSON;
+    # '-' selects stdin/stdout. The schema comes from -s when given, otherwise from
+    # the 'semantic-md' front matter key, which must name a local file below
+    # SOURCE's directory (and requires -d). HTTP(S) schemas are not implemented.
+    with click.open_file(source, 'r', 'utf-8') as src:
+        front, body = convert.md_parse_front_matter(src.read())
+    if not schema:
+        part = front.get('semantic-md') if isinstance(front, dict) else None
+        if not part:
+            raise click.UsageError(
+                'No semantic-md schema in front matter, use -s to specify one'
+            )
+        if httpish(part):
+            if not http:
+                raise click.UsageError('Use -x to allow HTTP schema downloads')
+            # ClickException prints its message; click.Abort would only print "Aborted!"
+            raise click.ClickException('FIXME: not yet implemented')
+        if not subdirs:
+            raise click.UsageError('Use -d to allow referencing schemas in subdirs')
+        partpath, partname = os.path.split(part)
+        if source == '-':
+            parent = os.getcwd()
+        else:
+            parent = os.path.abspath(os.path.split(source)[0])
+        # join() keeps an absolute partpath as-is; abspath() collapses any '..' parts
+        partpath = os.path.abspath(os.path.join(parent, partpath))
+        # Compare whole path components: a plain startswith() would also accept a
+        # sibling directory such as /data/docs-private when parent is /data/docs.
+        try:
+            inside = os.path.commonpath([parent, partpath]) == parent
+        except ValueError:
+            # commonpath refuses paths without a common root (e.g. other drive)
+            inside = False
+        if not inside:
+            raise click.UsageError(f'Schema {partpath} is not in a subdir of {parent}')
+        schema = os.path.join(partpath, partname)
+    with click.open_file(schema, 'r', 'utf-8') as schema_file:
+        s = convert.Schema.read(schema_file.read())
+    parsed = convert.md_parse_body(body, s)
+    out = convert.to_json(parsed, s)
+    # Open the destination last so a failed conversion does not truncate it.
+    with click.open_file(destination, 'w', 'utf-8') as dst:
+        dump(out, dst, indent=2)

semantic_md-0.0.1/semantic_md/convert.py ADDED Viewed

@@ -0,0 +1,102 @@
+import re
+from yaml import safe_load
+from mistletoe import Document
+from mistletoe.block_token import Heading
+from mistletoe.markdown_renderer import MarkdownRenderer, BlankLine
+from .match import HeadingMatch, apply_path_vars, parse_matches
+from .text import NoMatch
+class Schema:
+    """Root of a semantic-md schema: the match rules tried at document level.
+    Instances are built with :meth:`read`, which sets ``children`` and ``sections``
+    to lists of parsed match objects.
+    """
+    @classmethod
+    def read(cls, yaml_str):
+        """Parse schema YAML text and return a populated Schema.
+        The document must be a mapping with exactly the keys ``sections`` and
+        ``semantic-md-version``; anything else raises InputError. (This used to be
+        an ``assert``, which disappears under ``python -O`` and crashed with a
+        TypeError on an empty document.)
+        """
+        y = safe_load(yaml_str)
+        if not isinstance(y, dict) or set(y) != {'sections', 'semantic-md-version'}:
+            raise InputError(
+                "schema must be a mapping with exactly the keys 'sections' and "
+                "'semantic-md-version'"
+            )
+        schema = cls()
+        # the key check above rules out top-level 'children', so this is always []
+        schema.children = parse_matches(y.get('children') or [])
+        # `or []` also covers an explicit `sections: null`
+        schema.sections = parse_matches(y.get('sections') or [])
+        return schema
+class MatchFrame:
+    """Iteration state for the match rules of one schema level.
+    Iterating yields the level's remaining ``children`` followed by its
+    ``sections``. The children come from a single shared iterator, so a child that
+    has been yielded once (matched or passed over) is not offered again, while
+    sections are yielded afresh on every iteration.
+    """
+    def __init__(self, schema: Schema | HeadingMatch):
+        self.schema = schema
+        # sections repeat, children only match once
+        # HeadingMatch.children defaults to None when the schema lists no children
+        self.children = iter(schema.children or [])
+    def __iter__(self):
+        yield from self.children
+        if self.schema.sections:
+            yield from self.schema.sections
+def to_json(doc: Document, schema: Schema):
+    """Walk the top-level tokens of ``doc`` and build a JSON document from ``schema``.
+    Each non-blank token must be matched by a rule of the current schema level; the
+    rule's patch is applied under the current path prefix. A matched HeadingMatch
+    opens a nested level (its own rules and path prefix) that stays active until a
+    heading of the same or a shallower level appears.
+    Returns the accumulated JSON document. Raises NoMatch, naming the line number and
+    rendered markdown, for the first token no rule matches.
+    """
+    doc_pos = 0
+    # level of the innermost open heading; 0 means only the schema root is open
+    heading_level = 0
+    # parallel stacks: one MatchFrame and one path fragment per open level
+    schema_stack = [MatchFrame(schema)]
+    prefix_stack = ['/']
+    json_doc = {}
+    while doc_pos < len(doc.children):
+        tok = doc.children[doc_pos]
+        if isinstance(tok, BlankLine):
+            doc_pos += 1
+            continue
+        if isinstance(tok, Heading):
+            # close every open level that is as deep as, or deeper than, this heading
+            while tok.level <= heading_level:
+                schema_stack.pop()
+                prefix_stack.pop()
+                heading_level -= 1
+        for match in schema_stack[-1]:
+            if result := match.match_md(doc.children, doc_pos):
+                doc_pos += result.tokens
+                json_doc = match.patch(result, json_doc, ''.join(prefix_stack))
+                if isinstance(match, HeadingMatch):
+                    heading = match.doc[0]
+                    # FIXME: enforce these
+                    assert isinstance(heading, Heading)
+                    assert heading_level + 1 == heading.level
+                    heading_level = heading.level
+                    schema_stack.append(MatchFrame(match))
+                    # the conditional covers the whole expression: a heading without
+                    # patch_path contributes an empty fragment
+                    prefix_stack.append(
+                        apply_path_vars(match.patch_path, result.vars_map) + '/'
+                        if match.patch_path
+                        else ''
+                    )
+                break
+        else:
+            # no rule at this level matched the token: report where and what
+            with MarkdownRenderer() as renderer:
+                raise NoMatch(
+                    f'line {doc.children[doc_pos].line_number}\n'
+                    + renderer.render(doc.children[doc_pos])
+                )
+    return json_doc
+class InputError(Exception):
+    """Raised when the input markdown lacks the expected YAML front matter."""
+def md_parse_front_matter(s):
+    """Split markdown text into its YAML front matter and the remaining body.
+    The text must open with a ``---`` line (only whitespace may precede it) and
+    contain a second ``---`` line closing the front matter.
+    Returns ``(front, body)``: ``front`` is whatever ``safe_load`` yields for the
+    text between the two delimiters and ``body`` is everything after the second.
+    Raises InputError when that structure is not found.
+    """
+    # maxsplit is passed by keyword: positional use is deprecated since Python 3.13
+    parts = re.split(r'^---\s*$', s, maxsplit=2, flags=re.MULTILINE)
+    if len(parts) != 3 or parts[0].strip() or '\n' not in parts[1]:
+        raise InputError('expected yaml front matter not found')
+    front = safe_load(parts[1])
+    body = parts[2]
+    return front, body
+def md_parse_body(body, schema):
+    """Parse markdown ``body`` into a mistletoe Document (``schema`` is not used)."""
+    # Parsing has to happen inside a MarkdownRenderer context, otherwise tokens
+    # such as BlankLine are not created.
+    with MarkdownRenderer():
+        document = Document(body)
+    return document

semantic_md-0.0.1/semantic_md/match.py ADDED Viewed

@@ -0,0 +1,366 @@
+import re
+import json
+from dataclasses import dataclass
+from typing import Any
+from jsonpatch import JsonPatch
+from jsonpointer import JsonPointer
+from mistletoe import Document
+from mistletoe.block_token import (
+    BlockToken,
+    Table,
+    TableCell,
+    Paragraph,
+    Quote,
+    BlockCode,
+    CodeFence,
+    List,
+    ListItem,
+    HtmlBlock,
+)
+from mistletoe.markdown_renderer import MarkdownRenderer, BlankLine
+from mistletoe.span_token import RawText
+from mistletoe.token import Token
+from .text import match_content, NoMatch
+# NOTE(review): not referenced anywhere in the code visible here; the name refers
+# to mistune although the module parses with mistletoe - possibly a leftover.
+MISTUNE_PLUGINS = ['table', 'def_list']
+# A pattern consisting solely of `{name|md}` (whitespace tolerated); group 1 is name.
+MD_FILTER_VAR = re.compile(r'^\s*{\s*(\w+[\d\w]*)\s*\|\s*md\s*}\s*$')
+# Block token types a `{name|md}` variable is allowed to capture.
+MD_FILTER_TYPES = (
+    Paragraph,
+    Quote,
+    BlockCode,
+    CodeFence,
+    List,
+    HtmlBlock,
+    BlankLine,
+)
+# A pattern consisting solely of `{name|list}` (whitespace tolerated); group 1 is name.
+LIST_FILTER_VAR = re.compile(r'^\s*{\s*(\w+[\d\w]*)\s*\|\s*list\s*}\s*$')
+# Outcome of a successful match against the document tokens.
+@dataclass
+class MatchResult:
+    # number of document tokens the match consumed
+    tokens: int
+    # captured values keyed by variable name; a `{name|list}` capture stores a list
+    # of strings rather than a single string
+    vars_map: dict[str, str]
+# MatchResult for a table: also carries the table's body rows.
+@dataclass
+class TableMatchResult(MatchResult):
+    # the matched Table token's children, i.e. one entry per row
+    table_data: list[Token]
+def is_md_filter_var(match_token):
+    """Return the variable name if match_token is a lone ``{var|md}`` paragraph.
+    Anything else (other token types, extra children, no pattern match) gives None.
+    """
+    if not isinstance(match_token, Paragraph):
+        return None
+    spans = match_token.children
+    if len(spans) != 1:
+        return None
+    found = MD_FILTER_VAR.match(spans[0].content)
+    if not found:
+        return None
+    return found.group(1)
+def is_list_filter_var(match_token):
+    """Return the variable name if match_token is the one-item list ``- {var|list}``.
+    The list must hold a single item containing a single paragraph with a single
+    span whose text is the ``{var|list}`` placeholder; otherwise None is returned.
+    """
+    if not isinstance(match_token, List) or len(match_token.children) != 1:
+        return None
+    item = match_token.children[0]
+    if not isinstance(item, ListItem) or len(item.children) != 1:
+        return None
+    para = item.children[0]
+    if not isinstance(para, Paragraph) or len(para.children) != 1:
+        return None
+    found = LIST_FILTER_VAR.match(para.children[0].content)
+    if not found:
+        return None
+    return found.group(1)
+def match_content_tree(
+    vars_map: dict[str, str | None],
+    match_token: Token,
+    tokens: list[Token],
+    token_pos: int,
+) -> int:
+    """
+    returns number of tokens matched by match_token starting
+    from tokens[token_pos]. On match updates vars_map in-place.
+    Returns 0 when there is no match, including when token_pos is past the end.
+    Three kinds of pattern token are handled:
+    - ``{var|md}``: captures the rendered markdown of the following run of
+      MD_FILTER_TYPES tokens (trailing blank lines are consumed but not rendered);
+    - ``- {var|list}``: captures the item texts of one List token;
+    - anything else: must be the same token type, with children matching one by one
+      and RawText content matched through match_content.
+    A variable that already has a value must capture the same value again.
+    """
+    if md_var := is_md_filter_var(match_token):
+        # end: one past the last token of the run; nonblank: last token to render
+        end = token_pos
+        nonblank = token_pos
+        while end < len(tokens) and isinstance(tokens[end], MD_FILTER_TYPES):
+            if not isinstance(tokens[end], BlankLine):
+                nonblank = end
+            end += 1
+        if end == token_pos:
+            # nothing capturable here: fail without touching vars_map
+            return 0
+        with MarkdownRenderer() as renderer:
+            md = ''.join(
+                renderer.render(tokens[j]) for j in range(token_pos, nonblank + 1)
+            )
+        if vars_map[md_var] is None:
+            vars_map[md_var] = md
+        elif vars_map[md_var] != md:
+            return 0
+        # counting up to `end` also covers a run that reaches the end of tokens
+        return end - token_pos
+    if token_pos >= len(tokens):
+        return 0
+    if list_var := is_list_filter_var(match_token):
+        if not isinstance(tokens[token_pos], List):
+            return 0
+        # FIXME: handle non-single-paragraph lists
+        list_vals = [
+            c.children[0].children[0].content for c in tokens[token_pos].children
+        ]
+        if vars_map[list_var] is None:
+            vars_map[list_var] = list_vals
+        elif vars_map[list_var] != list_vals:
+            return 0
+        return 1
+    if not isinstance(tokens[token_pos], type(match_token)):
+        return 0
+    if mtoks := match_token.children:
+        toks = tokens[token_pos].children
+        pos = 0
+        for mt in mtoks:
+            # >= : the pattern still has children but the content has run out
+            if pos >= len(toks):
+                return 0
+            if not (matched := match_content_tree(vars_map, mt, toks, pos)):
+                return 0
+            pos += matched
+        if pos < len(toks):
+            # content has children the pattern does not account for
+            return 0
+    if isinstance(match_token, RawText) and hasattr(match_token, 'content'):
+        try:
+            new_vars = match_content(
+                vars_map, match_token.content, tokens[token_pos].content
+            )
+        except NoMatch:
+            return 0
+        vars_map.update(new_vars)
+    return 1
+def match_block_tokens(
+    match_tokens: list[BlockToken],
+    tokens: list[BlockToken],
+    token_pos: int,
+    vars_: list[str],
+) -> MatchResult | None:
+    """Match the pattern tokens, in order, against ``tokens`` from ``token_pos``.
+    Blank lines following each matched pattern token are skipped. Returns a
+    MatchResult with the number of tokens consumed and the captured variables, or
+    None as soon as a pattern token fails or the document tokens run out.
+    """
+    # every declared variable starts out unset
+    vars_map = dict.fromkeys(vars_ or [])
+    cursor = token_pos
+    for pattern in match_tokens:
+        if cursor >= len(tokens):
+            return None
+        matched = match_content_tree(vars_map, pattern, tokens, cursor)
+        if not matched:
+            return None
+        cursor += matched
+        while cursor < len(tokens) and isinstance(tokens[cursor], BlankLine):
+            cursor += 1
+    return MatchResult(cursor - token_pos, vars_map)
+def match_table_columns(
+    cols: list[str],
+    row_submatch: dict[str, Any] | None,
+    tokens: list[BlockToken],
+    pos: int,
+) -> TableMatchResult | None:
+    """Match ``tokens[pos]`` as a table whose header cells equal ``cols``.
+    Returns a TableMatchResult consuming one token and carrying the table's rows, or
+    None when the token is not a Table, the column count differs, or a header cell
+    does not match its expected text. ``row_submatch`` is accepted but not verified.
+    """
+    table = tokens[pos]
+    if not isinstance(table, Table):
+        return None
+    header_cells = table.header.children
+    if len(header_cells) != len(cols):
+        return None
+    vars_map = {}
+    for i, txt in enumerate(cols):
+        # match_content_tree reports a mismatch by returning 0 (it never raises
+        # NoMatch), so the result must be checked for the header to matter
+        if not match_content_tree(vars_map, TableCell(content=txt), header_cells, i):
+            return None
+    if row_submatch:
+        # FIXME: check that submatch matches
+        pass
+    return TableMatchResult(1, vars_map, table.children)
+def apply_path_vars(path: str, vars_map: dict[str, str]):
+    """Return ``path`` with every ``$name`` placeholder replaced by its value.
+    A placeholder only matches when followed by a word boundary, so ``$1`` leaves
+    ``$10`` alone. Values are inserted verbatim.
+    """
+    result = path
+    for name, value in vars_map.items():
+        placeholder = re.compile(r'\$' + name + r'\b')
+        # FIXME: jsonpath escaping for val?
+        # a callable replacement keeps backslashes in the value literal
+        result = placeholder.sub(lambda _m, value=value: value, result)
+    return result
+def apply_json_patch(
+    patch_add: dict[str:Any],
+    vars_map: dict[str, str],
+    json_doc: dict[str, Any],
+    prefix: str,
+) -> dict[str, Any]:
+    """Fill the variables into ``patch_add`` and add the results to ``json_doc``.
+    ``patch_add`` maps relative paths to JSON values; both may contain ``$name``
+    placeholders. Each entry becomes a JSON Patch ``add`` operation at
+    ``prefix + path``. Containers missing along the way are created in ``json_doc``
+    first. Returns the result of applying the patch.
+    """
+    filled_patch = {}
+    for path, json_value in patch_add.items():
+        # substitute on the serialized JSON text, then parse it back
+        jv = json.dumps(json_value)
+        path = apply_path_vars(path, vars_map)
+        for var, val in vars_map.items():
+            # only a whole quoted string "$var" is replaced, by the JSON encoding of
+            # the value, so non-string values (e.g. lists) keep their type
+            jv = re.sub(r'(?<!\\)"\$' + var + '"', lambda m: json.dumps(val), jv)
+        filled_patch[path] = json.loads(jv)
+    # create missing objects in paths
+    prefix_pos = json_doc
+    # [:-1] drops the last pointer part, which is empty when prefix ends in '/'
+    for step in JsonPointer(prefix).get_parts()[:-1]:
+        prefix_pos = prefix_pos.setdefault(step, {})
+    for path, json_value in filled_patch.items():
+        if path:
+            path_pos = prefix_pos
+            steps = JsonPointer('/' + path).get_parts()
+            for i, step in enumerate(steps):
+                # a path ending in '-' appends: its parent has to be a list
+                if steps[-1:] == ['-'] and i == len(steps) - 2:
+                    path_pos = path_pos.setdefault(step, [])
+                    break
+                # note this also creates the final step as {} when it is absent
+                path_pos = path_pos.setdefault(step, {})
+    operations = [
+        {'op': 'add', 'path': prefix + path, 'value': json_value}
+        for path, json_value in filled_patch.items()
+    ]
+    return JsonPatch(operations).apply(json_doc)
+def apply_table_json_patch(
+    row_patch_add: dict[str:Any],
+    row_submatch: dict[str:Any] | None,
+    result: TableMatchResult,
+    json_doc: dict[str, Any],
+    prefix: str,
+) -> dict[str, Any]:
+    """Apply the row patch, and any row sub-match rules, for every matched table row.
+    Cell texts are exposed as variables named '1', '2', ... by column position.
+    ``row_submatch`` maps a source expression (variables substituted) to a list of
+    rules; a rule has either ``filter_match`` or ``match`` plus its ``patch_add``.
+    Rules whose pattern does not match the source text are skipped.
+    Returns the updated JSON document.
+    """
+    # FIXME: assuming cells are always a single RawText
+    for row in result.table_data:
+        vars_map = {
+            f'{i + 1}': cell.children[0].content for i, cell in enumerate(row.children)
+        }
+        json_doc = apply_json_patch(row_patch_add, vars_map, json_doc, prefix)
+        for rm, rules in (row_submatch or {}).items():
+            src = apply_path_vars(rm, vars_map)
+            for rule in rules:
+                if flt := rule.get('filter_match'):
+                    try:
+                        content_vars = match_content({'content': None}, flt, src)
+                    except NoMatch:
+                        continue
+                    # a filter passes what it captured as {content} on to the
+                    # following rules as their source text
+                    if cmatch := content_vars['content']:
+                        src = cmatch
+                    json_doc = apply_json_patch(
+                        rule['patch_add'], vars_map, json_doc, prefix
+                    )
+                elif mat := rule.get('match'):
+                    mvars = {var: None for var in rule['vars'] or []}
+                    try:
+                        mvars = match_content(mvars, mat, src)
+                    except NoMatch:
+                        continue
+                    # variables captured by the rule take precedence over row variables
+                    json_doc = apply_json_patch(
+                        rule['patch_add'], {**vars_map, **mvars}, json_doc, prefix
+                    )
+    return json_doc
+class MatchBase:
+    """Shared behaviour of match rules.
+    Subclasses supply the attributes used here: ``doc``, ``vars_`` and ``patch_add``.
+    """
+    def match_md(
+        self,
+        tokens: list[BlockToken],
+        pos: int,
+    ) -> MatchResult | None:
+        """Match this rule's pattern tokens against ``tokens`` starting at ``pos``."""
+        outcome = match_block_tokens(self.doc, tokens, pos, self.vars_)
+        return outcome
+    def patch(
+        self, result: MatchResult, json_doc: dict[str, Any], prefix: str
+    ) -> dict[str, Any]:
+        """Apply ``patch_add`` using the matched variables; no-op without a patch."""
+        if not self.patch_add:
+            return json_doc
+        return apply_json_patch(self.patch_add, result.vars_map, json_doc, prefix)
+# Rule matching a fixed sequence of markdown block tokens (schema key `match`).
+@dataclass
+class Match(MatchBase):
+    # pattern tokens, parsed from the rule's markdown text
+    doc: list[BlockToken]
+    # relative path -> JSON value to add on a match (may be None, see parse_match)
+    patch_add: dict[str:Any]
+    # names of the variables the pattern may capture (may be None, see parse_match)
+    vars_: list[str]
+@dataclass
+class TableMatch(MatchBase):
+    """Rule matching a table by its header texts and patching once per row."""
+    cols: list[str]
+    row_patch_add: dict[str:Any]
+    row_submatch: dict[str:Any] | None = None
+    @property
+    def vars_(self):
+        """Positional variable names, one per column: '$1', '$2', ..."""
+        count = len(self.cols)
+        return ['$' + str(number) for number in range(1, count + 1)]
+    def match_md(
+        self,
+        tokens: list[BlockToken],
+        pos: int,
+    ) -> TableMatchResult | None:
+        """Match ``tokens[pos]`` as a table with this rule's columns."""
+        return match_table_columns(self.cols, self.row_submatch, tokens, pos)
+    def patch(
+        self, result: TableMatchResult, json_doc: dict[str, Any], prefix: str
+    ) -> dict[str, Any]:
+        """Apply the row patch and row sub-matches for each matched table row."""
+        patched = apply_table_json_patch(
+            self.row_patch_add, self.row_submatch, result, json_doc, prefix
+        )
+        return patched
+# Rule matching a heading (schema key `heading_match`); it may carry nested rules.
+@dataclass
+class HeadingMatch(MatchBase):
+    # pattern tokens, parsed from the rule's markdown text
+    doc: list[BlockToken]
+    # path fragment (may contain $vars) that callers use as prefix for nested rules
+    patch_path: str | None
+    # relative path -> JSON value to add when the heading matches
+    patch_add: dict[str:Any] | None
+    # names of the variables the heading pattern may capture
+    vars_: list[str] | None
+    # nested rules filled in by parse_match; both stay None when the schema omits them
+    children: list[MatchBase] | None = None
+    sections: list[MatchBase] | None = None
+class UnknownMatch(Exception):
+    """Raised for a schema rule that has none of the recognised match keys."""
+def parse_match(m):
+    """Build the match object for one schema rule mapping ``m``.
+    The first non-empty key among ``heading_match``, ``match`` and ``table_match``
+    selects the rule type; UnknownMatch is raised when none is present.
+    """
+    heading_md = m.get('heading_match')
+    if heading_md:
+        heading = HeadingMatch(
+            Document(heading_md).children,
+            m.get('patch_path'),
+            m.get('patch_add'),
+            m.get('vars'),
+        )
+        nested_children = m.get('children')
+        if nested_children:
+            heading.children = parse_matches(nested_children)
+        nested_sections = m.get('sections')
+        if nested_sections:
+            heading.sections = parse_matches(nested_sections)
+        return heading
+    block_md = m.get('match')
+    if block_md:
+        return Match(Document(block_md).children, m.get('patch_add'), m.get('vars'))
+    columns = m.get('table_match')
+    if columns:
+        return TableMatch(columns, m['row_patch_add'], m.get('row_submatch'))
+    raise UnknownMatch(m)
+def parse_matches(matches):
+    """Parse a list of schema rule mappings into match objects, keeping the order."""
+    parsed = []
+    for rule in matches:
+        parsed.append(parse_match(rule))
+    return parsed

semantic_md-0.0.1/semantic_md/text.py ADDED Viewed

@@ -0,0 +1,38 @@
+import re
+class NoMatch(Exception):
+    """Raised when content does not match the pattern it is compared against."""
+def match_content(
+    vars_map: dict[str, str | None],
+    pattern: str,
+    content: str,
+) -> dict[str, str]:
+    """
+    Collect/apply vars_map to {key} values in pattern against
+    content.
+    Return newly collected values when pattern matches content
+    (empty dict is success with no values) otherwise raise NoMatch.
+    Variables whose value is None are captured; variables that already have a value
+    must appear literally in content. A captured variable used more than once must
+    match the same text each time. Leading/trailing whitespace in content is ignored.
+    """
+    captured = set()
+    def fill(m):
+        var = m.group(1)
+        if var not in vars_map:
+            # not a declared variable: keep the braces as literal text
+            return m.group(0)
+        val = vars_map[var]
+        if val is not None:
+            return re.escape(val)
+        if var in captured:
+            return '(?P=g' + var + ')'
+        captured.add(var)
+        return '(?P<g' + var + '>.*)'
+    # var names are restricted to [a-z][0-9]_
+    # All placeholders are filled in one pass over the escaped pattern, so text that
+    # looks like a placeholder inside an inserted value is never expanded again.
+    ep = r'^\s*' + re.sub(r'\\\{(\w+)\\\}', fill, re.escape(pattern)) + r'\s*$'
+    m = re.match(ep, content)
+    if not m:
+        raise NoMatch()
+    return {
+        var: m.group('g' + var)
+        for var, val in vars_map.items()
+        if val is None and 'g' + var in m.groupdict()
+    }

semantic_md-0.0.1/tests/datadict.yml ADDED Viewed

@@ -0,0 +1,107 @@
+# Proposed SemanticMD-based data dictionary format
+#
+# Describes dataset metadata, resource metadata, tables and columns
+# See https://semanticmd.org for more information
+semantic-md-version: 0
+sections:
+- heading_match: |
+    # Dataset {id}
+  vars: [id]
+  patch_add:
+    datasets/$id: {}
+  patch_path: datasets/$id
+  children:
+  - match: |
+      {desc | md}
+    vars: [desc]
+    patch_add:
+      description: $desc
+  - table_match: [Resource, Table, Title]
+    row_patch_add:
+      resources/-: {"name": $1, "table": $2, "title": $3}
+- heading_match: |
+    # Table {id}
+  vars: [id]
+  patch_add:
+    tables/$id: {}
+  patch_path: tables/$id
+  children:
+  - table_match: [Column, Type, Label]
+    row_patch_add:
+      columns/$1/label: $3
+    row_submatch:
+      $2:
+        - filter_match: "required {content}"
+          patch_add:
+            columns/$1/required: true
+        - match: "{typename}"
+          vars: [typename]
+          patch_add:
+            columns/$1/type: $typename
+  - table_match: [Primary key]
+    row_patch_add:
+      primary_key/-: $1
+  sections:
+  - heading_match: |
+      ## Column {id}
+    vars: [id]
+    patch_path: columns/$id
+    children:
+    - match: |
+        {desc | md}
+      vars: [desc]
+      patch_add:
+        description: $desc
+    - match: |
+        ### Validation
+        Values >= {min}
+      vars: [min]
+      patch_add:
+        validation: {"gte": $min}
+    - match: |
+        ### Choices
+        - {vals | list}
+      vars: [vals]
+      patch_add:
+        choices: $vals
+- heading_match: |
+    # Resource {id}
+  vars: [id]
+  patch_path: resources/$id
+  children:
+  - match: |
+      {desc | md}
+    vars: [desc]
+    patch_add:
+      description: $desc
+  sections:
+  - heading_match: |
+      ## Statistics
+    children:
+    - table_match: [Column, Min, Max, Cardinality, Null Count]
+      row_patch_add:
+        columns/$1/statistics: {
+          "min": $2, "max": $3, "cardinality": $4, "null_count": $5,
+        }
+    - heading_match: |
+        ### Frequency for {col}
+      vars: [col]
+      patch_path: columns/$col
+      children:
+      - table_match: [Choice, Frequency]
+        row_patch_add:
+          frequency/$1: $2

semantic_md-0.0.1/tests/example.json ADDED Viewed

@@ -0,0 +1,93 @@
+{
+    "datasets": {
+        "311 NYC Data": {
+            "description": "311 NYC dataset example\n\nAnd some extra text for |md to match\n",
+            "resources": [
+                {
+                    "name": "NYC_311_SR_2010-2020-sample-1M.csv",
+                    "table": "311nyc",
+                    "title": "NYC 311 Data 1M row sample"
+                }
+            ]
+        }
+    },
+    "tables": {
+        "311nyc": {
+            "columns": {
+                "Unique Key": {
+                    "label": "Record Identifier",
+                    "required": true,
+                    "type": "integer",
+                    "description": "A unique numeric identifier for each complaint record.\n\nIt is the primary key in the dataset and has 1,000,000 distinct values (100% uniqueness).\n",
+                    "validation": {
+                        "gte": "1"
+                    }
+                },
+                "Created Date": {
+                    "label": "Complaint Creation Timestamp",
+                    "required": true,
+                    "type": "timestamp",
+                    "description": "UTC timestamp indicating when a 311 service request was logged.\n\nThe dates span from January\u202f1\u202f2010 to December\u202f23\u202f2020 with a mean around November\u202f10\u202f2015. Approximately 84% of records have missing values.\n"
+                },
+                "Status": {
+                    "label": "Complaint Status",
+                    "required": true,
+                    "type": "text",
+                    "description": "Current processing status\u2014Closed, Pending, Open, etc.\n\nClosed complaints dominate (~95\u202f%), with small percentages remaining pending or open.\n",
+                    "choices": [
+                        "Assigned",
+                        "Closed",
+                        "Closed - Testing",
+                        "Email Sent",
+                        "In Progress",
+                        "Open",
+                        "Pending",
+                        "Started",
+                        "Unassigned",
+                        "Unspecified"
+                    ]
+                }
+            },
+            "primary_key": [
+                "Unique Key"
+            ]
+        }
+    },
+    "resources": {
+        "NYC_311_SR_2010-2020-sample-1M.csv": {
+            "columns": {
+                "Unique Key": {
+                    "statistics": {
+                        "min": "11465364",
+                        "max": "48478173",
+                        "cardinality": "1000000",
+                        "null_count": "0"
+                    }
+                },
+                "Created Date": {
+                    "statistics": {
+                        "min": "2010-01-01T00:00:00+00:00",
+                        "max": "2020-12-23T01:25:51+00:00",
+                        "cardinality": "841014",
+                        "null_count": "0"
+                    }
+                },
+                "Status": {
+                    "statistics": {
+                        "min": "Assigned",
+                        "max": "Unspecified",
+                        "cardinality": "10",
+                        "null_count": "0"
+                    },
+                    "frequency": {
+                        "Assigned": "6651",
+                        "In Progress": "7841",
+                        "Open": "12340",
+                        "Pending": "20119"
+                    }
+                }
+            }
+        }
+    }
+}

semantic_md-0.0.1/tests/example.md ADDED Viewed

@@ -0,0 +1,79 @@
+---
+semantic-md: datadict.yml
+---
+# Dataset 311 NYC Data
+311 NYC dataset example
+And some extra text for |md to match
+| Resource | Table | Title |
+| --- | --- | --- |
+| NYC_311_SR_2010-2020-sample-1M.csv | 311nyc | NYC 311 Data 1M row sample |
+# Table 311nyc
+| Column | Type | Label |
+| --- | --- | --- |
+| Unique Key | required integer | Record Identifier |
+| Created Date | required timestamp | Complaint Creation Timestamp |
+| Status | required text | Complaint Status |
+| Primary key |
+| --- |
+| Unique Key |
+## Column Unique Key
+A unique numeric identifier for each complaint record.
+It is the primary key in the dataset and has 1,000,000 distinct values (100% uniqueness).
+### Validation
+Values >= 1
+## Column Created Date
+UTC timestamp indicating when a 311 service request was logged.
+The dates span from January 1 2010 to December 23 2020 with a mean around November 10 2015. Approximately 84% of records have missing values.
+## Column Status
+Current processing status—Closed, Pending, Open, etc.
+Closed complaints dominate (~95 %), with small percentages remaining pending or open.
+### Choices
+- Assigned
+- Closed
+- Closed - Testing
+- Email Sent
+- In Progress
+- Open
+- Pending
+- Started
+- Unassigned
+- Unspecified
+# Resource NYC_311_SR_2010-2020-sample-1M.csv
+## Statistics
+| Column | Min | Max | Cardinality | Null Count |
+| --- | ---: | ---: | ---: | ---: |
+| Unique Key | 11465364 | 48478173 | 1000000 | 0 |
+| Created Date | 2010-01-01T00:00:00+00:00 | 2020-12-23T01:25:51+00:00 | 841014 | 0 |
+| Status | Assigned | Unspecified | 10 | 0 |
+### Frequency for Status
+| Choice | Frequency |
+| :--- | ---: |
+| Pending | 20119 |
+| Open | 12340 |
+| In Progress | 7841 |
+| Assigned | 6651 |

semantic_md-0.0.1/tests/test_datadict.py ADDED Viewed

@@ -0,0 +1,24 @@
+import os
+import json
+from yaml import safe_load
+from semantic_md import convert
+HERE = os.path.split(__file__)[0]
+def test1():
+    f = open(os.path.join(HERE, 'example.md')).read()
+    front, body = convert.md_parse_front_matter(f)
+    assert front['semantic-md'] == 'datadict.yml'
+    schema = safe_load(open(os.path.join(HERE, 'datadict.yml')))
+    parsed = convert.md_parse_body(body, schema)
+    s = open(os.path.join(HERE, 'datadict.yml')).read()
+    schema = convert.Schema.read(s)
+    out = convert.to_json(parsed, schema)
+    assert out == json.load(open(os.path.join(HERE, 'example.json')))