flexdoc 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flexdoc/__init__.py +54 -0
- flexdoc/docs/__init__.py +155 -0
- flexdoc/docs/base_blocks.py +170 -0
- flexdoc/docs/block_info.py +167 -0
- flexdoc/docs/block_tree.py +142 -0
- flexdoc/docs/block_types.py +104 -0
- flexdoc/docs/collect.py +161 -0
- flexdoc/docs/debug.py +177 -0
- flexdoc/docs/doc_graph.py +288 -0
- flexdoc/docs/doc_graph_schema.json +219 -0
- flexdoc/docs/flex_doc.py +782 -0
- flexdoc/docs/frontmatter.py +77 -0
- flexdoc/docs/interval_index.py +73 -0
- flexdoc/docs/links.py +109 -0
- flexdoc/docs/node.py +165 -0
- flexdoc/docs/node_table.py +428 -0
- flexdoc/docs/paragraphs.py +395 -0
- flexdoc/docs/render.py +100 -0
- flexdoc/docs/search_tokens.py +83 -0
- flexdoc/docs/sections.py +130 -0
- flexdoc/docs/sizes.py +47 -0
- flexdoc/docs/span_ref.py +192 -0
- flexdoc/docs/token_diffs.py +367 -0
- flexdoc/docs/token_mapping.py +91 -0
- flexdoc/docs/wordtoks.py +284 -0
- flexdoc/html/__init__.py +74 -0
- flexdoc/html/extractor.py +30 -0
- flexdoc/html/html_in_md.py +510 -0
- flexdoc/html/html_plaintext.py +25 -0
- flexdoc/html/html_tags.py +423 -0
- flexdoc/html/timestamps.py +58 -0
- flexdoc/py.typed +0 -0
- flexdoc/util/__init__.py +11 -0
- flexdoc/util/read_time.py +59 -0
- flexdoc/util/token_estimate.py +34 -0
- flexdoc-0.1.0.dist-info/METADATA +101 -0
- flexdoc-0.1.0.dist-info/RECORD +39 -0
- flexdoc-0.1.0.dist-info/WHEEL +4 -0
- flexdoc-0.1.0.dist-info/licenses/LICENSE +21 -0
flexdoc/__init__.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""
|
|
2
|
+
flexdoc is a source-grounded, layered document model for Markdown and text: parse to a
|
|
3
|
+
`FlexDoc`, query its structure across independent layers with `collect()`, serialize it
|
|
4
|
+
as a `DocGraph`, and anchor spans and edits with `SpanRef` so they survive reparse. It
|
|
5
|
+
is a standalone library (chopdiff builds its diff and windowed-transform layer on top
|
|
6
|
+
of it).
|
|
7
|
+
|
|
8
|
+
The root exports the working set for typical use — the entry point, the serialization
|
|
9
|
+
contract, the reference type, and the enums nearly every query or measurement needs:
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
from flexdoc import FlexDoc, NodeKind, TextUnit
|
|
13
|
+
|
|
14
|
+
doc = FlexDoc.from_text(markdown_text)
|
|
15
|
+
links = doc.collect(kinds={NodeKind.link}, recursive=True)
|
|
16
|
+
words = doc.size(TextUnit.words)
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
The full public surfaces live in the submodules:
|
|
20
|
+
|
|
21
|
+
- `flexdoc.docs` — `FlexDoc`, `Paragraph`, `Sentence`, `Section`, `Block`, `BlockType`,
|
|
22
|
+
the node table, `collect()`, `DocGraph`, `SpanRef` and its resolvers, render helpers
|
|
23
|
+
for source-linked HTML, token diffs/mappings, and word-token utilities.
|
|
24
|
+
- `flexdoc.html` — html-in-md, html/plaintext conversion, HTML tag helpers, the content
|
|
25
|
+
extractor, and timestamp extraction.
|
|
26
|
+
- `flexdoc.util` — read-time and token-count estimation.
|
|
27
|
+
|
|
28
|
+
Unit types (`Paragraph`, `Sentence`, `Section`, `Block`, `Node`) are reached from a
|
|
29
|
+
parsed `FlexDoc` rather than imported, so they stay in `flexdoc.docs`, as do functions
|
|
30
|
+
whose bare names need module context (e.g. `resolve`). Root additions are deliberate;
|
|
31
|
+
`tests/test_root_api.py` pins the exact surface.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from flexdoc.docs import (
|
|
35
|
+
BlockType,
|
|
36
|
+
Detail,
|
|
37
|
+
DocGraph,
|
|
38
|
+
FlexDoc,
|
|
39
|
+
Layer,
|
|
40
|
+
NodeKind,
|
|
41
|
+
SpanRef,
|
|
42
|
+
TextUnit,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
__all__ = [
|
|
46
|
+
"BlockType",
|
|
47
|
+
"Detail",
|
|
48
|
+
"DocGraph",
|
|
49
|
+
"FlexDoc",
|
|
50
|
+
"Layer",
|
|
51
|
+
"NodeKind",
|
|
52
|
+
"SpanRef",
|
|
53
|
+
"TextUnit",
|
|
54
|
+
]
|
flexdoc/docs/__init__.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# flake8: noqa: F401
|
|
2
|
+
|
|
3
|
+
from flexdoc.docs.base_blocks import BaseBlock, base_blocks
|
|
4
|
+
from flexdoc.docs.block_info import CodeInfo, ListInfo, TableInfo
|
|
5
|
+
from flexdoc.docs.block_tree import Block, parse_blocks, walk_blocks
|
|
6
|
+
from flexdoc.docs.block_types import BlockType, block_type_for
|
|
7
|
+
from flexdoc.docs.collect import collect
|
|
8
|
+
from flexdoc.docs.debug import (
|
|
9
|
+
doc_graph_yaml,
|
|
10
|
+
doc_report,
|
|
11
|
+
doc_report_data,
|
|
12
|
+
dump_views,
|
|
13
|
+
)
|
|
14
|
+
from flexdoc.docs.doc_graph import (
|
|
15
|
+
DEFAULT_INCLUDE,
|
|
16
|
+
Detail,
|
|
17
|
+
DocGraph,
|
|
18
|
+
NodeModel,
|
|
19
|
+
SourceInfo,
|
|
20
|
+
Views,
|
|
21
|
+
build_doc_graph,
|
|
22
|
+
)
|
|
23
|
+
from flexdoc.docs.flex_doc import FlexDoc
|
|
24
|
+
from flexdoc.docs.links import Link
|
|
25
|
+
from flexdoc.docs.node import LAYER_NESTING, Layer, NestingGuarantee, Node, NodeKind, NodeTable
|
|
26
|
+
from flexdoc.docs.node_table import build_node_table
|
|
27
|
+
from flexdoc.docs.paragraphs import Offsets, Paragraph, Sentence, SentIndex
|
|
28
|
+
from flexdoc.docs.render import parse_source_span_attr, render_node_attrs, wrap_with_node_attrs
|
|
29
|
+
from flexdoc.docs.search_tokens import search_tokens
|
|
30
|
+
from flexdoc.docs.sections import Section
|
|
31
|
+
from flexdoc.docs.sizes import TextUnit
|
|
32
|
+
from flexdoc.docs.span_ref import SpanRef, resolve, resolve_and_update
|
|
33
|
+
from flexdoc.docs.token_diffs import (
|
|
34
|
+
DIFF_FILTER_NONE,
|
|
35
|
+
DiffFilter,
|
|
36
|
+
DiffOp,
|
|
37
|
+
DiffStats,
|
|
38
|
+
OpType,
|
|
39
|
+
TokenDiff,
|
|
40
|
+
diff_docs,
|
|
41
|
+
diff_wordtoks,
|
|
42
|
+
scored_diff_wordtoks,
|
|
43
|
+
)
|
|
44
|
+
from flexdoc.docs.token_mapping import TokenMapping
|
|
45
|
+
from flexdoc.docs.wordtoks import (
|
|
46
|
+
BOF_STR,
|
|
47
|
+
BOF_TOK,
|
|
48
|
+
EOF_STR,
|
|
49
|
+
EOF_TOK,
|
|
50
|
+
PARA_BR_STR,
|
|
51
|
+
PARA_BR_TOK,
|
|
52
|
+
SENT_BR_STR,
|
|
53
|
+
SENT_BR_TOK,
|
|
54
|
+
SPACE_TOK,
|
|
55
|
+
SYMBOL_SEP,
|
|
56
|
+
Tag,
|
|
57
|
+
first_wordtok,
|
|
58
|
+
is_break_or_space,
|
|
59
|
+
is_div,
|
|
60
|
+
is_header_tag,
|
|
61
|
+
is_tag,
|
|
62
|
+
is_tag_close,
|
|
63
|
+
is_tag_open,
|
|
64
|
+
is_whitespace_or_punct,
|
|
65
|
+
is_word,
|
|
66
|
+
join_wordtoks,
|
|
67
|
+
normalize_wordtok,
|
|
68
|
+
wordtok_len,
|
|
69
|
+
wordtok_to_str,
|
|
70
|
+
wordtokenize,
|
|
71
|
+
wordtokenize_with_offsets,
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
__all__ = [
|
|
75
|
+
"DEFAULT_INCLUDE",
|
|
76
|
+
"Detail",
|
|
77
|
+
"DocGraph",
|
|
78
|
+
"NodeModel",
|
|
79
|
+
"SourceInfo",
|
|
80
|
+
"Views",
|
|
81
|
+
"build_doc_graph",
|
|
82
|
+
"search_tokens",
|
|
83
|
+
"TextUnit",
|
|
84
|
+
"Block",
|
|
85
|
+
"BlockType",
|
|
86
|
+
"block_type_for",
|
|
87
|
+
"parse_blocks",
|
|
88
|
+
"walk_blocks",
|
|
89
|
+
"CodeInfo",
|
|
90
|
+
"ListInfo",
|
|
91
|
+
"TableInfo",
|
|
92
|
+
"Offsets",
|
|
93
|
+
"Link",
|
|
94
|
+
"Paragraph",
|
|
95
|
+
"Section",
|
|
96
|
+
"Sentence",
|
|
97
|
+
"SentIndex",
|
|
98
|
+
"FlexDoc",
|
|
99
|
+
"DIFF_FILTER_NONE",
|
|
100
|
+
"DiffFilter",
|
|
101
|
+
"DiffOp",
|
|
102
|
+
"DiffStats",
|
|
103
|
+
"OpType",
|
|
104
|
+
"TokenDiff",
|
|
105
|
+
"diff_docs",
|
|
106
|
+
"diff_wordtoks",
|
|
107
|
+
"scored_diff_wordtoks",
|
|
108
|
+
"TokenMapping",
|
|
109
|
+
"BOF_STR",
|
|
110
|
+
"BOF_TOK",
|
|
111
|
+
"EOF_STR",
|
|
112
|
+
"EOF_TOK",
|
|
113
|
+
"PARA_BR_STR",
|
|
114
|
+
"PARA_BR_TOK",
|
|
115
|
+
"SENT_BR_STR",
|
|
116
|
+
"SENT_BR_TOK",
|
|
117
|
+
"SPACE_TOK",
|
|
118
|
+
"SYMBOL_SEP",
|
|
119
|
+
"Tag",
|
|
120
|
+
"first_wordtok",
|
|
121
|
+
"is_break_or_space",
|
|
122
|
+
"is_div",
|
|
123
|
+
"is_header_tag",
|
|
124
|
+
"is_tag",
|
|
125
|
+
"is_tag_close",
|
|
126
|
+
"is_tag_open",
|
|
127
|
+
"is_whitespace_or_punct",
|
|
128
|
+
"is_word",
|
|
129
|
+
"join_wordtoks",
|
|
130
|
+
"normalize_wordtok",
|
|
131
|
+
"wordtok_len",
|
|
132
|
+
"wordtok_to_str",
|
|
133
|
+
"wordtokenize",
|
|
134
|
+
"wordtokenize_with_offsets",
|
|
135
|
+
"BaseBlock",
|
|
136
|
+
"base_blocks",
|
|
137
|
+
"collect",
|
|
138
|
+
"doc_report",
|
|
139
|
+
"doc_report_data",
|
|
140
|
+
"doc_graph_yaml",
|
|
141
|
+
"dump_views",
|
|
142
|
+
"LAYER_NESTING",
|
|
143
|
+
"Layer",
|
|
144
|
+
"NestingGuarantee",
|
|
145
|
+
"Node",
|
|
146
|
+
"NodeKind",
|
|
147
|
+
"NodeTable",
|
|
148
|
+
"build_node_table",
|
|
149
|
+
"SpanRef",
|
|
150
|
+
"resolve",
|
|
151
|
+
"resolve_and_update",
|
|
152
|
+
"render_node_attrs",
|
|
153
|
+
"wrap_with_node_attrs",
|
|
154
|
+
"parse_source_span_attr",
|
|
155
|
+
]
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Sequential base-block partition of a Markdown document.
|
|
3
|
+
|
|
4
|
+
A base block is a unit of the flat, depth-annotated partition described in
|
|
5
|
+
flexdoc-spec section 6. The partition is ordered by source position, its spans are
|
|
6
|
+
non-overlapping, and together they cover every non-whitespace character of the
|
|
7
|
+
document exactly once (the gaps are inter-block and structural whitespace). It is the
|
|
8
|
+
view for block-by-block processing and resequencing.
|
|
9
|
+
|
|
10
|
+
Each base block retains its exact `source_span`, so exact source reconstruction is
|
|
11
|
+
available by slicing the source at those spans (or via the structural `blocks()` tree).
|
|
12
|
+
Reassembling the rendered base-block *text* is lossy for list-item continuation content:
|
|
13
|
+
list markers and continuation indentation are whitespace outside the trimmed spans, so a
|
|
14
|
+
naive text concatenation normalizes them. Reconstruct from offsets when exactness matters.
|
|
15
|
+
|
|
16
|
+
Leaf/atomic blocks (heading, paragraph, table, code, thematic_break, html, footnote, and
|
|
17
|
+
a whole blockquote) are each one base block. Lists decompose: each list item at every
|
|
18
|
+
nesting level is its own base block with increasing depth, and a list item's continuation
|
|
19
|
+
content (paragraphs after or between nested sublists) is emitted with its own real block
|
|
20
|
+
type at the item's depth — never relabeled `list_item`.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from dataclasses import dataclass
|
|
26
|
+
|
|
27
|
+
from marko.element import Element
|
|
28
|
+
|
|
29
|
+
from flexdoc.docs.block_tree import Block, parse_blocks
|
|
30
|
+
from flexdoc.docs.block_types import BlockType
|
|
31
|
+
|
|
32
|
+
# Block types emitted whole: each one is always a single base block and is never
# decomposed into child base blocks.
_ATOMIC_TYPES = frozenset(
    (
        BlockType.heading,
        BlockType.paragraph,
        BlockType.table,
        BlockType.code,
        BlockType.thematic_break,
        BlockType.html,
        BlockType.footnote,
        BlockType.blockquote,
    )
)

# List containers, which decompose into their list_item children.
_LIST_TYPES = frozenset((BlockType.list, BlockType.ordered_list))
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class BaseBlock:
    """
    One entry of the flat, sequential base-block partition.

    Pairs the underlying `Block` (which carries its own type, span, and children) with
    the `depth` at which it sits: 0 for top-level blocks, increasing for nested list
    items.
    """

    # The structural block this partition entry wraps.
    block: Block
    # Nesting level of the entry within the partition.
    depth: int
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def base_blocks(
    text: str, *, item_partition_depth: int = 6, parsed: Element | None = None
) -> list[BaseBlock]:
    """
    Build the flat, depth-annotated sequential partition of `text` into base blocks.

    `item_partition_depth` controls how far lists are split:

    - `N` (default 6): list items are split down to N nesting levels; anything nested
      deeper stays whole inside its depth-N base block.
    - `-1`: no limit; list items are split at every nesting level.
    - `0`: lists are not split at all; each list is a single base block.

    A blockquote is always a single base block, whatever its depth.

    `parsed` is the marko parse of `text`. Pass it to reuse a parse the caller already
    has; when omitted, `text` is parsed here.

    Invariants: entries are ordered by source position, their spans do not overlap, and
    together they cover every non-whitespace character exactly once. For exact source
    reconstruction use each block's `source_span` rather than concatenating block text
    (the module docstring explains the continuation-content caveat).
    """
    partition: list[BaseBlock] = []
    top_level = parse_blocks(text, parsed)
    _collect_base_blocks(text, top_level, 0, item_partition_depth, partition)
    return partition
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _collect_base_blocks(
    text: str,
    blocks: list[Block],
    depth: int,
    max_depth: int,
    out: list[BaseBlock],
) -> None:
    """
    Append the base blocks for `blocks` to `out`, in order.

    A list is split into its `list_item` children (each handed to `_emit_list_item`,
    which may split further) unless `max_depth` is 0. Everything else is appended whole
    at `depth`: the atomic types (blockquotes included), a list when `max_depth` is 0,
    a bare `list_item`, and any block type not otherwise recognized.
    """
    for block in blocks:
        is_split_list = (
            block.type not in _ATOMIC_TYPES and block.type in _LIST_TYPES and max_depth != 0
        )
        if not is_split_list:
            # Atomic, unsplit list, bare list_item, or unknown type: one base block.
            out.append(BaseBlock(block=block, depth=depth))
            continue
        # Items of a top-level list start at nesting level 1.
        for item in block.children:
            _emit_list_item(text, item, depth, max_depth, 1, out)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _emit_list_item(
    text: str,
    item: Block,
    depth: int,
    max_depth: int,
    current_nesting: int,
    out: list[BaseBlock],
) -> None:
    """
    Append the base blocks for one list item to `out`.

    An item with no nested list, or one already at the depth limit (`max_depth` other
    than -1 and `current_nesting >= max_depth`), is appended whole. Any other item is
    decomposed, in source order, into:

    - a `list_item` head block running from the item's start (its list marker) through
      the lead content, ending before the first nested sublist, at `depth`;
    - the items of each nested sublist, recursed at `depth + 1`;
    - each *continuation* block (non-list content ending after the head) appended with
      its own real block type (e.g. `paragraph`), at `depth`.

    Continuation content keeps its real type instead of being relabeled `list_item`, so
    a consumer can distinguish a continuation paragraph from an independent list item.
    Spans do not overlap and cover every non-whitespace character. Reconstruct exact
    source from each block's `source_span` (or the structural `blocks()` tree), not by
    joining base-block text: list-marker and continuation indentation are whitespace
    outside the trimmed spans.
    """
    sublists = [child for child in item.children if child.type in _LIST_TYPES]
    depth_capped = max_depth != -1 and current_nesting >= max_depth

    if depth_capped or not sublists:
        out.append(BaseBlock(block=item, depth=depth))
        return

    # Head: from the marker up to the first nested sublist, with the whitespace that
    # separates them trimmed off the end. It keeps the item's own type.
    item_start = item.span[0]
    head_end = min(sublist.span[0] for sublist in sublists)
    while head_end > item_start and text[head_end - 1].isspace():
        head_end -= 1
    if head_end > item_start:
        head_block = Block(
            type=item.type, span=(item_start, head_end), children=[], tight=item.tight
        )
        out.append(BaseBlock(block=head_block, depth=depth))

    # Remaining children in source order: a non-list child ending past the head is
    # continuation content (one already inside the head is skipped); a sublist has each
    # of its items emitted one level deeper.
    for child in item.children:
        if child.type not in _LIST_TYPES:
            if child.span[1] > head_end:
                out.append(BaseBlock(block=child, depth=depth))
            continue
        for nested_item in child.children:
            _emit_list_item(text, nested_item, depth + 1, max_depth, current_nesting + 1, out)
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Typed, parser-authoritative metadata for code, table, and list blocks.
|
|
3
|
+
|
|
4
|
+
The structs (`CodeInfo`, `TableInfo`, `ListInfo`) and the pure extractors that read them
|
|
5
|
+
off a marko block element are the single place that knows how to pull a
|
|
6
|
+
language/dimension/list fact from the parse. Both the structural `Block` path (the
|
|
7
|
+
density-invariant source of truth) and the `Paragraph` editing-view path reuse these, so
|
|
8
|
+
neither re-parses and both agree.
|
|
9
|
+
|
|
10
|
+
Extraction is parser-authoritative: every fact comes from a marko element attribute
|
|
11
|
+
(`FencedCode.lang`, `Table.num_of_cols` and per-cell `.align`, `List.ordered`/`.start`/
|
|
12
|
+
subtree), never a regex over source text, matching `block_types.py`'s rule.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from typing import Literal
|
|
19
|
+
|
|
20
|
+
from marko.block import CodeBlock, FencedCode, List, ListItem
|
|
21
|
+
from marko.element import Element
|
|
22
|
+
from marko.ext.gfm.elements import Table, TableCell, TableRow
|
|
23
|
+
|
|
24
|
+
Alignment = Literal["left", "center", "right", "default"]
"""Alignment of one table column. `default` stands for a delimiter-row cell that leaves
the alignment undefined (no colon), so a per-column list always holds explicit strings
and never empty/null entries."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
class CodeInfo:
    """Immutable, typed metadata describing one code block."""

    language: str | None
    """Language from the fence's info string; `None` for an indented code block, which
    has no info string."""
    line_count: int
    """Number of body lines; the fence lines are not counted."""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass(frozen=True)
class TableInfo:
    """Immutable, typed metadata describing one GFM table."""

    rows: int
    """Row count, with the header row included."""
    cols: int
    """Column count (marko `Table.num_of_cols`)."""
    cells: int
    """`rows * cols`."""
    alignments: list[Alignment]
    """Alignment of each column, length `cols`."""
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass(frozen=True)
class ListInfo:
    """Immutable, typed metadata describing one list block."""

    ordered: bool
    start: int | None
    """For an ordered list, `List.start` (e.g. `3` when the list begins `3.`); `None`
    otherwise."""
    max_depth: int
    """Nesting depth: `1` for a flat list, `2` with one level of nested sublist, etc."""
    item_count: int
    """Number of direct `list_item` children."""
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _code_body(element: FencedCode | CodeBlock) -> str:
    """Concatenate the text of a code block's children (its `RawText` nodes).

    A child contributes only when its own `children` attribute is a `str`; a missing or
    empty `children` on `element` gives the empty string.
    """
    child_payloads = (
        getattr(child, "children", None) for child in getattr(element, "children", []) or []
    )
    return "".join(payload for payload in child_payloads if isinstance(payload, str))
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def code_info_for(element: Element) -> CodeInfo | None:
    """Return the `CodeInfo` for a fenced or indented code block; `None` for anything else.

    The language is the element's `lang` attribute, with a missing or empty value
    reported as `None`. The line count is the number of lines in the body text.
    """
    if isinstance(element, (FencedCode, CodeBlock)):
        language = getattr(element, "lang", "") or None
        body_lines = _code_body(element).splitlines()
        return CodeInfo(language=language, line_count=len(body_lines))
    return None
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _alignment(cell: object) -> Alignment:
    """Return `cell`'s `align` attribute when it is `left`, `center`, or `right`.

    Any other value, including a missing attribute, maps to `default`.
    """
    align = getattr(cell, "align", None)
    if align in ("left", "center", "right"):
        return align
    return "default"
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def table_info_for(element: Element) -> TableInfo | None:
    """Return the `TableInfo` for a GFM table; `None` for anything else.

    Rows are the table's `TableRow` children, columns come from `num_of_cols` (0 when
    absent or falsy), and the per-column alignments are read from the cells of the
    first row (empty when the table has no rows).
    """
    if not isinstance(element, Table):
        return None
    row_elements = [child for child in element.children if isinstance(child, TableRow)]
    col_count = int(getattr(element, "num_of_cols", 0) or 0)
    alignments: list[Alignment] = []
    if row_elements:
        alignments = [
            _alignment(cell) for cell in row_elements[0].children if isinstance(cell, TableCell)
        ]
    row_count = len(row_elements)
    return TableInfo(
        rows=row_count, cols=col_count, cells=row_count * col_count, alignments=alignments
    )
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _list_max_depth(element: List) -> int:
    """Return the list's nesting depth: `1` when flat, plus one per level of nested sublist.

    Only `List` elements found directly inside this list's `ListItem` children count
    as nested sublists.
    """
    # Seeded with 0 so a list with no nested sublists comes out as depth 1.
    sublist_depths = [0]
    for item in element.children:
        if isinstance(item, ListItem):
            item_children = getattr(item, "children", []) or []
            sublist_depths.extend(
                _list_max_depth(sub) for sub in item_children if isinstance(sub, List)
            )
    return 1 + max(sublist_depths)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def list_info_for(element: Element) -> ListInfo | None:
    """Return the `ListInfo` for a list element; `None` for anything else.

    `start` is filled in only for an ordered list whose `start` attribute is set, and
    `item_count` counts the list's direct `ListItem` children.
    """
    if not isinstance(element, List):
        return None
    is_ordered = bool(element.ordered)
    first_number: int | None = None
    if is_ordered and getattr(element, "start", None) is not None:
        first_number = int(element.start)
    direct_items = [child for child in element.children if isinstance(child, ListItem)]
    return ListInfo(
        ordered=is_ordered,
        start=first_number,
        max_depth=_list_max_depth(element),
        item_count=len(direct_items),
    )
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
## Tests
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _parse_first(markdown: str) -> Element:
    """Parse `markdown` with flowmark and return its first top-level element that is not
    a blank line (raises `StopIteration` when there is none)."""
    from flowmark import flowmark_markdown
    from marko.block import BlankLine

    document = flowmark_markdown().parse(markdown)
    non_blank = filter(lambda element: not isinstance(element, BlankLine), document.children)
    return next(non_blank)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def test_code_info_extractor():
    """`code_info_for` reports language and body line count, and `None` for non-code."""
    fenced = _parse_first("```python\nx = 1\ny = 2\n```\n")
    assert code_info_for(fenced) == CodeInfo("python", 2)
    # Indented code has no info string, so language is None. Each line needs four
    # leading spaces: with fewer the text parses as a paragraph, not a code block.
    indented = _parse_first("    indented\n    code\n")
    assert code_info_for(indented) == CodeInfo(None, 2)
    # A non-code element yields None.
    assert code_info_for(_parse_first("just a paragraph\n")) is None
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def test_table_info_extractor():
    """`table_info_for` reports dimensions and per-column alignment, and `None` for non-tables."""
    aligned_md = "| a | b | c |\n|:--|:-:|--:|\n| 1 | 2 | 3 |\n| 4 | 5 | 6 |\n"
    expected_aligned = TableInfo(rows=3, cols=3, cells=9, alignments=["left", "center", "right"])
    assert table_info_for(_parse_first(aligned_md)) == expected_aligned

    # Columns with no alignment marker are "default", never empty/None.
    plain_md = "| a | b |\n| - | - |\n| 1 | 2 |\n"
    expected_plain = TableInfo(rows=2, cols=2, cells=4, alignments=["default", "default"])
    assert table_info_for(_parse_first(plain_md)) == expected_plain

    # A non-table element yields None.
    assert table_info_for(_parse_first("paragraph\n")) is None
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def test_list_info_extractor():
    """`list_info_for` reports ordering, start, depth and item count, and `None` for non-lists."""
    # The nested bullets are indented three spaces, the content column of "4. ", so
    # they parse as a sublist of the second item rather than as a new top-level list.
    ordered = _parse_first("3. a\n4. b\n   - nested1\n   - nested2\n5. c\n")
    assert list_info_for(ordered) == ListInfo(ordered=True, start=3, max_depth=2, item_count=3)
    flat = _parse_first("- a\n- b\n")
    assert list_info_for(flat) == ListInfo(ordered=False, start=None, max_depth=1, item_count=2)
    # A non-list element yields None.
    assert list_info_for(_parse_first("paragraph\n")) is None
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Structural block tree for a Markdown document, with exact source spans.
|
|
3
|
+
|
|
4
|
+
This is the opt-in, whole-document structural view (`FlexDoc.blocks()`) that resolves
|
|
5
|
+
what blank-line paragraph splitting cannot: it keeps a fenced code block whole even when
|
|
6
|
+
it contains blank lines, and it decomposes a list into individual `list_item`s with
|
|
7
|
+
nested sublists regardless of item spacing.
|
|
8
|
+
|
|
9
|
+
Block boundaries and spans come straight from flowmark's parser: every block element
|
|
10
|
+
produced by `flowmark_markdown().parse(text)` carries an authoritative
|
|
11
|
+
`element.span = (start, end)` read from marko's own parser state (see
|
|
12
|
+
`flowmark.markdown_ast.block_span`). flexdoc makes no block-boundary decisions of its
|
|
13
|
+
own, so there is no regex scanner and no per-line heuristic.
|
|
14
|
+
|
|
15
|
+
Containers (lists, list items, blockquotes) fully populate their block children
|
|
16
|
+
recursively, so a table inside a blockquote or a paragraph inside a list item is
|
|
17
|
+
reachable in the tree. The top-level `blocks()`/`parse_blocks` ordering and the `Block`
|
|
18
|
+
dataclass shape are unchanged.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from collections.abc import Iterator
|
|
24
|
+
from dataclasses import dataclass, field
|
|
25
|
+
|
|
26
|
+
from flowmark import flowmark_markdown
|
|
27
|
+
from flowmark.markdown_ast import block_span
|
|
28
|
+
from marko.block import BlankLine, CodeBlock, List, ListItem
|
|
29
|
+
from marko.block import Quote as MarkoQuote
|
|
30
|
+
from marko.element import Element
|
|
31
|
+
|
|
32
|
+
from flexdoc.docs.block_info import (
|
|
33
|
+
CodeInfo,
|
|
34
|
+
ListInfo,
|
|
35
|
+
TableInfo,
|
|
36
|
+
code_info_for,
|
|
37
|
+
list_info_for,
|
|
38
|
+
table_info_for,
|
|
39
|
+
)
|
|
40
|
+
from flexdoc.docs.block_types import BlockType, block_type_for
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class Block:
    """
    One structural block, located by an exact `[start, end)` span into the source.

    `span` has surrounding whitespace trimmed off, so `source[start:end]` is exactly the
    block's text.

    `children` holds the nested blocks. For a `list`/`ordered_list` these are its
    `list_item`s; for a `list_item` they are ALL of its block children (paragraphs,
    nested lists, tables, code, etc.); for a `blockquote` they are its nested blocks.
    Leaf blocks (heading, code, table, thematic_break, etc.) have none.

    `tight` records CommonMark list density on `list`/`ordered_list` blocks (`True` when
    no blank lines separate the items) and is `None` on every other block type. The
    tree itself is density-invariant: a loose list decomposes into one list block with
    the same `list_item` children as its tight form, so `tight` notes the spacing
    without altering the structure or the tallies.

    `code_info`, `table_info`, and `list_info` hold typed, parser-authoritative metadata
    (see `block_info`), each non-`None` only for its own block kind (`code` / `table` /
    `list`/`ordered_list`). Being facts derived from the same source span, they are
    left out of equality and `repr`: a block's identity is its type/span/children.
    """

    type: BlockType
    span: tuple[int, int]
    children: list[Block] = field(default_factory=list)
    tight: bool | None = None
    # Derived metadata, excluded from comparison and repr.
    code_info: CodeInfo | None = field(default=None, compare=False, repr=False)
    table_info: TableInfo | None = field(default=None, compare=False, repr=False)
    list_info: ListInfo | None = field(default=None, compare=False, repr=False)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def parse_blocks(text: str, parsed: Element | None = None) -> list[Block]:
    """
    Turn `text` into a tree of structural `Block`s carrying exact source spans.

    `parsed` is the marko parse of `text`. Supply it to reuse a shared parse (the caller
    guarantees it is the parse of exactly this `text`); when it is `None`, `text` is
    parsed here with flowmark.
    """
    if parsed is None:
        parsed = flowmark_markdown().parse(text)
    return _blocks_from(text, parsed)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def walk_blocks(blocks: list[Block], _depth: int = 0) -> Iterator[tuple[Block, int]]:
    """
    Walk a block tree depth-first (parents before children), yielding `(block, depth)`.

    Blocks in `blocks` are reported at `_depth` (0 by default, i.e. top level); each
    level of children is one deeper.
    """
    # Explicit stack; entries are pushed reversed so they pop in source order.
    pending = [(node, _depth) for node in reversed(blocks)]
    while pending:
        node, level = pending.pop()
        yield node, level
        pending.extend((child, level + 1) for child in reversed(node.children))
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _trim(text: str, lo: int, hi: int, *, keep_leading: bool = False) -> tuple[int, int]:
    """Narrow the span `[lo, hi)` so it no longer starts or ends on whitespace.

    marko spans take in trailing newlines and a nested element's leading
    indentation/marker line. Leading whitespace is dropped first, then trailing. With
    `keep_leading=True` only the trailing side is stripped (used for indented code
    blocks, whose leading spaces are syntax).
    """
    if not keep_leading:
        # First non-space index in [lo, hi); an all-whitespace span collapses onto hi.
        lo = next(
            (start for start in range(lo, hi) if not text[start].isspace()),
            max(lo, hi),
        )
    # Last end whose preceding character is non-space; otherwise collapse onto lo.
    hi = next(
        (end for end in range(hi, lo, -1) if not text[end - 1].isspace()),
        min(lo, hi),
    )
    return lo, hi
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _blocks_from(text: str, parent: Element) -> list[Block]:
    """
    Convert `parent`'s block children into `Block`s, leaving out blank lines.

    Containers are populated recursively: a `list` yields its `list_item`s, a
    `list_item` keeps all of its block children (paragraphs, nested lists, tables, code,
    etc.), and a `blockquote` keeps all of its nested blocks. Every other element
    becomes a leaf with no children.
    """
    result: list[Block] = []
    elements: list[Element] = getattr(parent, "children", []) or []
    for element in elements:
        if isinstance(element, BlankLine):
            continue
        kind = block_type_for(element)
        # An indented code block keeps its leading whitespace (the 4-space indent is
        # syntax); every other block is trimmed on both sides.
        raw_start, raw_end = block_span(element)
        span = _trim(text, raw_start, raw_end, keep_leading=isinstance(element, CodeBlock))
        is_list = isinstance(element, List)
        if is_list or isinstance(element, (ListItem, MarkoQuote)):
            nested = _blocks_from(text, element)
        else:
            nested = []
        # Only list blocks carry a density flag.
        tight: bool | None = element.tight if is_list else None
        result.append(
            Block(
                kind,
                span,
                nested,
                tight,
                code_info=code_info_for(element),
                table_info=table_info_for(element),
                list_info=list_info_for(element),
            )
        )
    return result
|