mdbind 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbind/__init__.py +0 -0
- mdbind/cache.py +170 -0
- mdbind/cli.py +1181 -0
- mdbind/composer.py +135 -0
- mdbind/cycle.py +24 -0
- mdbind/directives.py +116 -0
- mdbind/index.py +57 -0
- mdbind/models.py +86 -0
- mdbind/parser.py +241 -0
- mdbind-0.1.0.dist-info/METADATA +9 -0
- mdbind-0.1.0.dist-info/RECORD +14 -0
- mdbind-0.1.0.dist-info/WHEEL +5 -0
- mdbind-0.1.0.dist-info/entry_points.txt +2 -0
- mdbind-0.1.0.dist-info/top_level.txt +1 -0
mdbind/composer.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Materializacao semantica: motor de composicao documental (spec section 8.3).
|
|
3
|
+
"""
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import FrozenSet, List, Optional, Set
|
|
9
|
+
|
|
10
|
+
from mdbind.cycle import enter_node, would_cycle
|
|
11
|
+
from mdbind.directives import _resolve_uri
|
|
12
|
+
from mdbind.models import SectionGraph
|
|
13
|
+
|
|
14
|
+
# HTML comment emitted in place of an include whose target URI is not in the index.
_PLACEHOLDER_TPL = "<!-- mdgraph:unresolved uri=\"{uri}\" -->"
# Matches a line that consists solely of an include directive,
# "[@include](target)" or "[@include: label](target)"; group 1 is the raw target.
_INCLUDE_RE = re.compile(r"^\[@include(?::[^\]]*)?\]\(([^)]+)\)\s*$")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def compose(
    root_uri: str,
    graph: SectionGraph,
    *,
    strict: bool = False,
    deduplicate: bool = False,
    warnings: Optional[List[str]] = None,
    depth: Optional[int] = None,
) -> str:
    """Materialize the section at ``root_uri`` as one Markdown string.

    Include directives found in the section text are expanded recursively by
    ``_compose_node``. The root heading is always renormalized to level 1.

    Args:
        root_uri: URI of the section to start from; must exist in ``graph.index``.
        graph: Section graph whose index is used to look sections up.
        strict: When true, an include of an unknown URI raises instead of
            producing a placeholder.
        deduplicate: When true, a section that was already emitted is replaced
            by an ``@ref(uri)`` line.
        warnings: Optional list that receives warning messages; a throwaway
            list is used when omitted.
        depth: Maximum include depth; ``None`` means unlimited.

    Returns:
        The composed lines joined with ``"\\n"``.

    Raises:
        ValueError: If ``root_uri`` is not in the index (or, in strict mode,
            if an included URI is missing).
    """
    sink: List[str] = [] if warnings is None else warnings

    root = graph.index.get(root_uri)
    if root is None:
        raise ValueError(f"URI raiz nao encontrada: '{root_uri}'")

    emitted: Set[str] = set()
    composed = _compose_node(
        root_uri,
        graph,
        # Shift so that the root heading lands on level 1.
        heading_offset=1 - root.raw.heading_level,
        execution_path=frozenset(),
        seen=emitted,
        strict=strict,
        deduplicate=deduplicate,
        warnings=sink,
        depth=depth,
    )
    return "\n".join(composed)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _compose_node(
    uri: str,
    graph: SectionGraph,
    heading_offset: int,
    execution_path: FrozenSet[str],
    seen: Set[str],
    strict: bool,
    deduplicate: bool,
    warnings: List[str],
    depth: Optional[int] = None,
) -> List[str]:
    """Return the output lines for one section, expanding its includes.

    Args:
        uri: URI of the section to emit.
        graph: Section graph used for lookups.
        heading_offset: Value added to every heading level in this section.
        execution_path: URIs currently being expanded (ancestors of this node).
        seen: URIs already emitted anywhere in the output; mutated in place.
        strict: Raise ``ValueError`` for an unknown ``uri`` instead of
            returning a placeholder line.
        deduplicate: Emit ``@ref(uri)`` instead of the body when ``uri`` was
            already emitted.
        warnings: List that receives warning messages; mutated in place.
        depth: Remaining include depth; ``None`` means unlimited. When it is
            ``<= 0`` include lines are dropped without being expanded.

    Returns:
        The section's lines with headings shifted and include lines replaced
        by the included sections' lines.
    """
    section = graph.index.get(uri)
    if section is None:
        problem = f"URI nao encontrada: '{uri}'"
        if strict:
            raise ValueError(problem)
        warnings.append(problem)
        return [_PLACEHOLDER_TPL.format(uri=uri)]

    if deduplicate and uri in seen:
        return [f"@ref({uri})"]

    seen.add(uri)
    path_here = enter_node(uri, execution_path)
    remaining = None if depth is None else depth - 1

    output: List[str] = []
    for source_line in _raw_lines(section):
        shifted = _adjust_heading(source_line, heading_offset)
        include = _INCLUDE_RE.match(shifted.strip())
        if include is None:
            output.append(shifted)
            continue

        # Include targets are resolved relative to the file that holds this section.
        target = _resolve_uri(include.group(1).strip(), section.file_path)

        if would_cycle(target, path_here):
            warnings.append(
                f"Ciclo detectado: '{target}' ja esta no caminho "
                f"de execucao. Aresta rompida."
            )
            continue  # the include line is dropped, breaking the cycle

        if depth is not None and depth <= 0:
            continue  # depth budget exhausted: drop the include line

        child = graph.index.get(target)
        if child is None:
            # Unknown target: the recursive call reports it; offset is irrelevant.
            child_offset = heading_offset
        else:
            # Place the child's heading one level below this section's new level.
            parent_level = section.raw.heading_level + heading_offset
            child_offset = parent_level + 1 - child.raw.heading_level

        output.extend(
            _compose_node(
                target,
                graph,
                heading_offset=child_offset,
                execution_path=path_here,
                seen=seen,
                strict=strict,
                deduplicate=deduplicate,
                warnings=warnings,
                depth=remaining,
            )
        )

    return output
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _raw_lines(section) -> List[str]:
    """Read the section's source lines from its file.

    The file at ``section.file_path`` is read as UTF-8 and the lines from
    ``section.raw.source_start_line`` through ``section.raw.source_end_line``
    (both 1-based, inclusive) are returned without line terminators.
    """
    span = section.raw
    content = Path(section.file_path).read_text(encoding="utf-8")
    # 1-based inclusive range -> 0-based half-open slice.
    return content.splitlines()[span.source_start_line - 1:span.source_end_line]
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _adjust_heading(line: str, offset: int) -> str:
    """Shift the level of an ATX heading line by ``offset``.

    A line is treated as a heading only when it starts with one to six ``#``
    characters followed by a space, a tab, or the end of the line. Any other
    line (including ``#tag``-style text) is returned unchanged. The resulting
    level is clamped to the range 1..6 so the output is still a heading.

    The function looks at a single line with no surrounding context, so a
    ``# ...`` line inside a fenced code block is still shifted.

    Args:
        line: One source line, without its line terminator.
        offset: Number of levels to add (may be negative).

    Returns:
        The line with its leading ``#`` run resized, or ``line`` itself.
    """
    if offset == 0 or not line.startswith("#"):
        return line

    original_level = len(line) - len(line.lstrip("#"))
    remainder = line[original_level:]

    # Seven or more '#', or '#' glued to text, is not a heading.
    if original_level > 6 or remainder[:1] not in ("", " ", "\t"):
        return line

    new_level = min(6, max(1, original_level + offset))
    return "#" * new_level + remainder
|
|
135
|
+
|
mdbind/cycle.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Deteccao e resolucao de ciclos durante a materializacao (spec section 5).
|
|
3
|
+
|
|
4
|
+
O digrafo pode conter ciclos nativamente. Durante o compose, o motor rastreia
|
|
5
|
+
o caminho de execucao atual P (stack de URIs). Se uma aresta de inclusao (x, y)
|
|
6
|
+
for avaliada e y ∈ P, o ciclo e detectado e a aresta e rompida silenciosamente.
|
|
7
|
+
|
|
8
|
+
O SectionGraph original NAO e modificado; apenas a materializacao e afetada.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import FrozenSet
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def would_cycle(uri: str, execution_path: FrozenSet[str]) -> bool:
    """Return True when ``uri`` is already on the current execution path.

    Including such a URI again would re-enter a section that is still being
    expanded, i.e. it would close a cycle.
    """
    already_open = uri in execution_path
    return already_open
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def enter_node(uri: str, execution_path: FrozenSet[str]) -> FrozenSet[str]:
    """Return a new path containing everything in ``execution_path`` plus ``uri``.

    The path that was passed in is left untouched.
    """
    return execution_path.union((uri,))
|
mdbind/directives.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Etapa 4 do pipeline: tokenizacao de diretivas.
|
|
3
|
+
|
|
4
|
+
Varre os tokens de conteudo de uma ParsedSection ja delimitada e converte
|
|
5
|
+
marcacoes semanticas (@ref, @include, @query) em objetos Directive tipados,
|
|
6
|
+
resolvendo URIs relativas ao arquivo de origem.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import List
|
|
13
|
+
|
|
14
|
+
from mdbind.models import Directive, ParsedSection
|
|
15
|
+
|
|
16
|
+
# Regex that captures [@type: label](uri) or [@type](uri)
#   Group 1: type (ref|include|query)
#   Group 2: optional label (may be empty or absent)
#   Group 3: target uri
_DIRECTIVE_RE = re.compile(r"\[@(ref|include|query)(?::\s*([^\]]*))?\]\(([^)]+)\)")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _resolve_uri(target: str, source_file_path: str) -> str:
    """Resolve a directive target against the file that contains it.

    The target is split at the first ``#`` into a path part and a fragment;
    the fragment (including the ``#``) is always preserved.

    * Empty target: returned as is.
    * No path part (``"#id"``): refers to the source file itself, so the
      result is ``source_file_path`` followed by the fragment.
    * Relative path: resolved against the directory of ``source_file_path``.
    * Absolute path: normalized the same way as a relative one, so that
      ``..`` segments do not produce a URI that differs from the relative
      spelling of the same file.

    Args:
        target: Raw target taken from the directive, e.g. ``"../a.md#intro"``.
        source_file_path: Path of the file in which the directive appears.

    Returns:
        The resolved path as a string, followed by the fragment if any.
    """
    if not target:
        return target

    path_part, hash_mark, anchor = target.partition("#")
    fragment = hash_mark + anchor

    if not path_part:
        # Same-file reference such as "#id".
        return source_file_path + fragment

    candidate = Path(path_part)
    if not candidate.is_absolute():
        candidate = Path(source_file_path).parent / candidate

    return str(candidate.resolve()) + fragment
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# Regex for the link text of a directive: "@type" or "@type: label".
#   Group 1: type (ref|include|query); Group 2: optional label.
_LINK_TEXT_RE = re.compile(r"^@(ref|include|query)(?::\s*(.*))?$")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def extract_directives(tokens: list, source_file_path: str) -> List[Directive]:
    """Collect the directives written as Markdown links in ``tokens``.

    A directive has the form ``[@type: label](uri)`` or ``[@type](uri)``.
    Inside an ``inline`` token it shows up as a ``link_open`` child (carrying
    the ``href`` attribute) immediately followed by a ``text`` child whose
    content is ``@type`` or ``@type: label``.

    ``link_open.attrs`` is accepted both as a mapping and as a sequence of
    ``[name, value]`` pairs, so a directive is not silently dropped when the
    tokenizer uses the pair form.

    Args:
        tokens: Block-level tokens to scan; only ``inline`` tokens with
            children are inspected.
        source_file_path: Path of the file the tokens came from, used to
            resolve relative targets.

    Returns:
        One ``Directive`` per matching link, in document order.
    """
    directives: List[Directive] = []

    for tok in tokens:
        if tok.type != "inline" or not tok.children:
            continue

        children = tok.children
        # Walk adjacent pairs: a link_open must be directly followed by its text.
        for opener, follower in zip(children, children[1:]):
            if opener.type != "link_open" or follower.type != "text":
                continue

            attrs = opener.attrs or {}
            if not isinstance(attrs, dict):
                try:
                    attrs = dict(attrs)
                except (TypeError, ValueError):
                    attrs = {}
            href = attrs.get("href", "")

            m = _LINK_TEXT_RE.match(follower.content.strip())
            if not m or not href:
                continue

            raw_label = m.group(2)
            directives.append(
                Directive(
                    type=m.group(1),  # type: ignore[arg-type]
                    target_uri=_resolve_uri(href.strip(), source_file_path),
                    label=raw_label.strip() if raw_label else None,
                )
            )

    return directives
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def bind_directives(section: ParsedSection, tokens: list) -> ParsedSection:
    """Return a copy of ``section`` with its ``directives`` field filled in.

    Only the section's own content is scanned: the three heading tokens
    (heading_open, inline, heading_close) are skipped and the scan stops at
    the first nested heading, so directives of sub-sections are excluded.
    """
    first = section.raw.token_start + 3
    last = section.raw.token_end + 1
    body = tokens[first:last]

    # Position of the first nested heading, or the end of the body if none.
    cut = next(
        (pos for pos, tok in enumerate(body) if tok.type == "heading_open"),
        len(body),
    )

    found = extract_directives(body[:cut], section.file_path)
    return section.model_copy(update={"directives": found})
|
mdbind/index.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Etapa 5 do pipeline: indexacao do repositorio e construcao do SectionGraph.
|
|
3
|
+
|
|
4
|
+
index_repository(root_path) -> SectionGraph
|
|
5
|
+
- Descobre recursivamente todos os .md no diretorio raiz
|
|
6
|
+
- Executa o pipeline parser.parse_file em cada arquivo (com cache incremental)
|
|
7
|
+
- Registra secoes no SectionIndex
|
|
8
|
+
- Constroi arestas bidirecionais no SectionGraph
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from mdbind.cache import build_index_with_cache, save_cache, serialize_section
|
|
15
|
+
from mdbind.models import SectionGraph, SectionIndex
|
|
16
|
+
from mdbind.parser import ParseError
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def index_repository(
    root_path: str | Path,
    *,
    no_cache: bool = False,
    persist_cache: bool = False,
) -> SectionGraph:
    """Index every ``*.md`` file under ``root_path`` into a ``SectionGraph``.

    Files are discovered recursively and handed, in sorted order, to
    ``build_index_with_cache``. Each returned section is registered in the
    index, and every ``ref`` or ``include`` directive becomes an edge from
    the section to the directive's target.

    Args:
        root_path: Directory to scan.
        no_cache: Forwarded to ``build_index_with_cache``.
        persist_cache: When true, the serialized sections and file hashes are
            passed to ``save_cache`` after indexing.

    Returns:
        The populated graph.

    Raises:
        ParseError: If the index rejects a section (duplicate URI).
    """
    root = Path(root_path).resolve()
    markdown_files = sorted(root.rglob("*.md"))

    parsed_sections, file_hashes = build_index_with_cache(
        root, markdown_files, no_cache=no_cache
    )

    index = SectionIndex()
    graph = SectionGraph(index=index)

    for parsed in parsed_sections:
        try:
            index.add(parsed)
        except ValueError as duplicate:
            raise ParseError(str(duplicate)) from duplicate

        edge_targets = (
            d.target_uri for d in parsed.directives if d.type in ("ref", "include")
        )
        for target in edge_targets:
            graph.add_edge(parsed.uri, target)

    if persist_cache:
        serialized = [serialize_section(s) for s in index.sections.values()]
        save_cache(root, file_hashes, serialized)

    return graph
|
mdbind/models.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from typing import Any, Dict, List, Literal, Set
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field, model_validator
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# ---------------------------------------------------------------------------
|
|
10
|
+
# Fase 2: Delimitacao Fisica
|
|
11
|
+
# ---------------------------------------------------------------------------
|
|
12
|
+
|
|
13
|
+
class RawSection(BaseModel):
    """Physical extent of a section: where it sits in the token list and in the source file."""

    # Heading depth (1 for "h1", 2 for "h2", ...).
    heading_level: int
    # Text of the heading.
    heading_text: str
    # Index of the section's heading_open token in the token list.
    token_start: int
    # Index of the last token that belongs to the section (inclusive).
    token_end: int
    # 1-based line of the heading in the source file.
    source_start_line: int
    # 1-based last line of the section in the source file (inclusive).
    source_end_line: int
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
# Fase 4: Semantica e Diretivas
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
class Directive(BaseModel):
    """A directive lifted out of the text into a typed, logical node."""

    # Kind of directive.
    type: Literal["ref", "include", "query"]
    # Target the directive points at.
    target_uri: str
    # Optional label written after the type.
    label: str | None = None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ParsedSection(BaseModel):
    """A section with meaning attached: its physical extent plus metadata and directives."""

    raw: RawSection
    uri: str
    file_path: str
    # metadata must always contain the key 'id'
    metadata: Dict[str, Any]
    directives: List[Directive] = Field(default_factory=list)

    @model_validator(mode="after")
    def _require_id_in_metadata(self) -> "ParsedSection":
        """Reject a section whose metadata has no 'id' key."""
        if "id" in self.metadata:
            return self
        raise ValueError("secao sem payload obrigatorio: campo 'id' ausente em metadata")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# ---------------------------------------------------------------------------
|
|
54
|
+
# Fase 5: Indexacao e Grafo
|
|
55
|
+
# ---------------------------------------------------------------------------
|
|
56
|
+
|
|
57
|
+
class SectionIndex(BaseModel):
    """Dictionary-backed store of parsed sections keyed by URI."""

    sections: Dict[str, ParsedSection] = Field(default_factory=dict)

    def add(self, section: ParsedSection) -> None:
        """Register ``section`` under its URI; raise ValueError if the URI is taken."""
        key = section.uri
        if key in self.sections:
            raise ValueError(f"URI duplicada no indice: '{key}'")
        self.sections[key] = section

    def get(self, uri: str) -> ParsedSection | None:
        """Return the section registered under ``uri``, or None when there is none."""
        try:
            return self.sections[uri]
        except KeyError:
            return None
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class SectionGraph(BaseModel):
    """Dependency graph between sections, stored in both directions (backlinks)."""

    index: SectionIndex = Field(default_factory=SectionIndex)
    # source URI -> URIs it points at
    outgoing_edges: Dict[str, Set[str]] = Field(
        default_factory=lambda: defaultdict(set)
    )
    # target URI -> URIs that point at it
    incoming_edges: Dict[str, Set[str]] = Field(
        default_factory=lambda: defaultdict(set)
    )

    model_config = {"arbitrary_types_allowed": True}

    def add_edge(self, source_uri: str, target_uri: str) -> None:
        """Record an edge ``source_uri -> target_uri`` in both edge maps.

        ``setdefault`` is used instead of relying on the maps being
        ``defaultdict`` instances: only the default factory produces a
        ``defaultdict``, so a map that was supplied explicitly may be a plain
        ``dict`` and would otherwise raise ``KeyError`` on a new key.
        """
        self.outgoing_edges.setdefault(source_uri, set()).add(target_uri)
        self.incoming_edges.setdefault(target_uri, set()).add(source_uri)
|
mdbind/parser.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Parser Markdown: pipeline Markdown -> AST -> RawSection -> ParsedSection.
|
|
3
|
+
|
|
4
|
+
Etapas cobertas (spec section 2):
|
|
5
|
+
1. Geracao de AST via markdown-it-py
|
|
6
|
+
2. Section Discovery -> RawSection
|
|
7
|
+
3. Metadata Binding -> ParsedSection
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import List
|
|
13
|
+
|
|
14
|
+
import yaml
|
|
15
|
+
from markdown_it import MarkdownIt
|
|
16
|
+
|
|
17
|
+
from mdbind.directives import bind_directives
|
|
18
|
+
from mdbind.models import Directive, ParsedSection, RawSection
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
# Erros de parsing
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
class ParseError(Exception):
    """Raised when a Markdown source violates the section rules (e.g. a duplicate id or a misplaced section block)."""

    pass
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
# Etapa 1: Geracao de AST
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
def _tokenize(text: str) -> list:
    """Parse Markdown ``text`` with a fresh ``MarkdownIt`` instance and return its tokens."""
    return MarkdownIt().parse(text)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
# Etapa 2: Section Discovery -> List[RawSection]
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
|
|
42
|
+
def _discover_sections(tokens: list) -> List[RawSection]:
    """Delimit sections in the flat token list, one per heading.

    A section runs from its ``heading_open`` token up to (but excluding) the
    next heading whose level is less than or equal to its own, or to the end
    of the document. Deeper headings are therefore contained in their parent.

    The heading text is assembled from every text and code-span child of the
    heading's inline token, so headings with inline formatting (for example
    ``# **Bold** title``) are not truncated to their first child.

    Args:
        tokens: Flat token list for one document.

    Returns:
        One ``RawSection`` per heading, in document order.
    """
    # (token index, level, 1-based source line; 0 when the token has no map)
    headings = [
        (pos, int(tok.tag[1]), (tok.map[0] + 1) if tok.map else 0)
        for pos, tok in enumerate(tokens)
        if tok.type == "heading_open"
    ]

    # End-of-document bounds are the same for every heading; compute them once.
    doc_last_token = len(tokens) - 1
    doc_last_line = _last_source_line(tokens)

    raws: List[RawSection] = []
    for idx, (token_start, level, source_start_line) in enumerate(headings):
        # The token right after heading_open is the inline token with the text.
        inline_tok = tokens[token_start + 1]
        heading_text = "".join(
            child.content
            for child in (inline_tok.children or [])
            if child.type in ("text", "code_inline")
        )

        token_end = doc_last_token
        source_end_line = doc_last_line
        for future_start, future_level, future_source in headings[idx + 1:]:
            if future_level <= level:
                # A heading of the same or a higher rank closes this section.
                token_end = future_start - 1
                source_end_line = future_source - 1
                break

        raws.append(RawSection(
            heading_level=level,
            heading_text=heading_text,
            token_start=token_start,
            token_end=token_end,
            source_start_line=source_start_line,
            source_end_line=source_end_line,
        ))

    return raws
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _last_source_line(tokens: list) -> int:
    """Return the last source line (1-based) referenced by the tokens.

    The tokens are scanned from the end; the first one that carries a ``map``
    decides the result. ``map[1]`` is an exclusive 0-based end index, which is
    numerically the 1-based number of the last covered line. When no token
    has a map, 1 is returned.
    """
    return next((tok.map[1] for tok in reversed(tokens) if tok.map), 1)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# ---------------------------------------------------------------------------
|
|
97
|
+
# Etapa 3: Metadata Binding -> ParsedSection
|
|
98
|
+
# ---------------------------------------------------------------------------
|
|
99
|
+
|
|
100
|
+
def _bind_metadata(
    raw: RawSection,
    tokens: list,
    file_path: str,
) -> ParsedSection | None:
    """Find the section's YAML ``section`` block and build a ``ParsedSection``.

    Only the section's direct content is scanned: the three heading tokens
    are skipped and the scan stops at the first nested heading. A ``yaml``
    fence whose content is a mapping with a ``section`` key is the section
    block. Every other block, including non-YAML fences, YAML fences without
    a ``section`` key, and YAML fences that fail to parse, counts as ordinary
    content.

    Args:
        raw: Physical extent of the section.
        tokens: Flat token list of the whole document.
        file_path: Path of the source file; used to build the section URI.

    Returns:
        The parsed section, or ``None`` when the section has no section block.

    Raises:
        ParseError: If ordinary content precedes the section block, if there
            is more than one section block, or if the ``section`` value is
            empty.
    """
    # +3 skips heading_open, inline (heading text) and heading_close.
    inner_tokens: list = []
    for tok in tokens[raw.token_start + 3: raw.token_end + 1]:
        if tok.type == "heading_open":
            break
        inner_tokens.append(tok)

    section_payloads: list[dict] = []
    first_text_seen = False

    for tok in inner_tokens:
        if tok.type == "fence":
            payload = None
            if tok.info.strip() == "yaml":
                try:
                    payload = yaml.safe_load(tok.content)
                except yaml.YAMLError:
                    payload = None

            if not isinstance(payload, dict) or "section" not in payload:
                # Any fence that is not a section block is ordinary content.
                first_text_seen = True
                continue

            if first_text_seen and not section_payloads:
                raise ParseError(
                    f"payload nao e o primeiro bloco na secao '{raw.heading_text}' "
                    f"(linha {raw.source_start_line})"
                )
            section_payloads.append(payload)
        elif tok.type in ("paragraph_open", "bullet_list_open",
                          "ordered_list_open", "blockquote_open", "html_block",
                          "table_open", "hr"):
            first_text_seen = True
        elif tok.type == "inline" and tok.content.strip():
            first_text_seen = True

    if len(section_payloads) > 1:
        raise ParseError(
            f"bloco section duplicado na secao '{raw.heading_text}' "
            f"(linha {raw.source_start_line})"
        )

    if not section_payloads:
        # No section block: there is no id to build a ParsedSection from, so
        # the caller decides whether to skip the section.
        return None

    # The payload was already parsed and checked to be a mapping above, so it
    # is used directly instead of being parsed a second time.
    metadata = section_payloads[0]

    if not metadata.get("section"):
        raise ParseError(
            f"secao sem payload obrigatorio: campo 'section' ausente na secao "
            f"'{raw.heading_text}' (linha {raw.source_start_line})"
        )

    section_id = str(metadata.pop("section"))
    metadata["id"] = section_id
    uri = f"{file_path}#{section_id}"

    return ParsedSection(
        raw=raw,
        uri=uri,
        file_path=file_path,
        metadata=metadata,
        directives=[],  # filled in later by bind_directives
    )
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# ---------------------------------------------------------------------------
|
|
202
|
+
# API publica
|
|
203
|
+
# ---------------------------------------------------------------------------
|
|
204
|
+
|
|
205
|
+
def parse_file(file_path: str | Path) -> List[ParsedSection]:
    """Run the full parsing pipeline on one Markdown file.

    The file is read as UTF-8 and handed to ``parse_text`` together with its
    path as a string. See ``parse_text`` for what is returned.
    """
    source = Path(file_path)
    return parse_text(source.read_text(encoding="utf-8"), file_path=str(source))
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def parse_text(text: str, file_path: str = "<string>") -> List[ParsedSection]:
    """Run the full parsing pipeline on raw Markdown text.

    The text is tokenized, sections are delimited, metadata is bound and
    directives are attached. Sections for which metadata binding yields
    ``None`` are left out of the result.

    Args:
        text: Markdown source.
        file_path: Path recorded on each section and used in section URIs.

    Returns:
        The parsed sections in document order.

    Raises:
        ParseError: If two sections in the text share the same id.
    """
    tokens = _tokenize(text)
    discovered = _discover_sections(tokens)

    known_ids: set[str] = set()
    result: List[ParsedSection] = []

    for raw in discovered:
        bound = _bind_metadata(raw, tokens, file_path)
        if bound is None:
            continue

        section_id = str(bound.metadata["id"])
        if section_id in known_ids:
            raise ParseError(
                f"id duplicado '{section_id}' no arquivo '{file_path}'"
            )
        known_ids.add(section_id)

        # Step 4: attach the section's directives.
        result.append(bind_directives(bound, tokens))

    return result
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
mdbind/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
mdbind/cache.py,sha256=KrdOyrR1pmkyQ6C-FjNlvGDvVGHs7RBaw1mMBF3EPok,5570
|
|
3
|
+
mdbind/cli.py,sha256=SQoHZV1jHvyQ7U_Ejjfjj8iVkjE37cfu0h30GDBGW3k,40992
|
|
4
|
+
mdbind/composer.py,sha256=mtLCeNdXblAVHf5cEY0-U7s_03c4h_1is52BDOLC5Kw,4175
|
|
5
|
+
mdbind/cycle.py,sha256=nnqWZKrfI0TpFqhpXQYfm4g0FJgaWTXyaJIhuG3V_-s,821
|
|
6
|
+
mdbind/directives.py,sha256=Vj_wRxoCsg2vt6hIwv_hc2R0yyHQtZmCPU6oSI-DQdg,4047
|
|
7
|
+
mdbind/index.py,sha256=M25hkb5QHT0cGsX2ujgdN5qUi5JiKbrqYWvEV8Xadl0,1786
|
|
8
|
+
mdbind/models.py,sha256=W79gzwYi5trhLMmdUhVx48J_FZn8hcVYrpI6gIpPPl0,2817
|
|
9
|
+
mdbind/parser.py,sha256=MsNPgORBq7MpNxXY07Lbactorn6ij8trG9M-v_Kyh_Q,8422
|
|
10
|
+
mdbind-0.1.0.dist-info/METADATA,sha256=SFy2YjelbOKXLTAKHDIORN-J9_586WXjEG82bC6uvko,248
|
|
11
|
+
mdbind-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
12
|
+
mdbind-0.1.0.dist-info/entry_points.txt,sha256=dAbUX6TuxiskOOMPGEtnMkyEeD3ab6NTpBQmwVxADYw,39
|
|
13
|
+
mdbind-0.1.0.dist-info/top_level.txt,sha256=rmhkm853CHFBfq831bRqCJcDDCUETMCUYBb2ytOutjA,7
|
|
14
|
+
mdbind-0.1.0.dist-info/RECORD,,
|