markdowndata 0.0.2__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markdowndata/content_parser.py +79 -13
- markdowndata/utils.py +26 -2
- {markdowndata-0.0.2.dist-info → markdowndata-0.0.3.dist-info}/METADATA +1 -1
- markdowndata-0.0.3.dist-info/RECORD +9 -0
- markdowndata-0.0.2.dist-info/RECORD +0 -9
- {markdowndata-0.0.2.dist-info → markdowndata-0.0.3.dist-info}/LICENSE +0 -0
- {markdowndata-0.0.2.dist-info → markdowndata-0.0.3.dist-info}/WHEEL +0 -0
markdowndata/content_parser.py
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import yaml
|
|
3
3
|
|
|
4
|
-
from .utils import convert_value, get_md_soup
|
|
4
|
+
from .utils import convert_value, get_md_soup, is_single_tag_block, is_structural_line
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
def detect_value_type(text: str) -> str | None:
|
|
8
8
|
"""
|
|
9
9
|
Detect the type of content block (YAML, table, list, or text).
|
|
10
|
-
|
|
10
|
+
Only classifies as md_table or md_list if the content is *only* that structure.
|
|
11
11
|
"""
|
|
12
12
|
text = text.strip()
|
|
13
13
|
if not text:
|
|
@@ -17,16 +17,19 @@ def detect_value_type(text: str) -> str | None:
|
|
|
17
17
|
if re.search(r'===\s*\n(.*?)\n===', text, re.DOTALL):
|
|
18
18
|
return 'yaml_dict'
|
|
19
19
|
|
|
20
|
-
# Convert markdown to HTML
|
|
20
|
+
# Convert markdown to HTML
|
|
21
21
|
soup = get_md_soup(text)
|
|
22
|
-
|
|
22
|
+
|
|
23
|
+
# Check for exactly one <table> and no other tags or text
|
|
24
|
+
if soup.find('table') and is_single_tag_block(soup, 'table'):
|
|
23
25
|
return 'md_table'
|
|
24
|
-
|
|
26
|
+
|
|
27
|
+
# Check for exactly one <ul> and no other tags or text
|
|
28
|
+
if soup.find('ul') and is_single_tag_block(soup, 'ul'):
|
|
25
29
|
return 'md_list'
|
|
26
|
-
elif soup.get_text(strip=True):
|
|
27
|
-
return 'md_text'
|
|
28
30
|
|
|
29
|
-
|
|
31
|
+
# Fallback: process everything as Markdown text
|
|
32
|
+
return 'md_text'
|
|
30
33
|
|
|
31
34
|
|
|
32
35
|
def yaml_dict_parser(text: str) -> dict:
|
|
@@ -77,12 +80,75 @@ def md_list_parser(text: str) -> list:
|
|
|
77
80
|
|
|
78
81
|
def md_text_parser(text: str) -> str:
    """
    Normalize a block of free-form Markdown text.

    Everything outside fenced code blocks is passed through
    `process_soft_breaks`; fenced code blocks are handed back untouched by
    `process_code_blocks`. The combined text is finally passed to
    `convert_value`, so the result may come back as a number when the whole
    block is numeric.
    """
    return convert_value(process_code_blocks(text, process_soft_breaks))
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def process_code_blocks(text: str, non_code_callback) -> str:
    """
    Apply `non_code_callback` to everything outside fenced code blocks.

    The text is split on fenced blocks (``` ... ```). Each fenced block is kept
    verbatim; each non-code segment that contains more than whitespace is
    passed through `non_code_callback`. The resulting pieces are joined with
    exactly one blank line between them.

    Args:
        text: Markdown source to process.
        non_code_callback: Callable taking a non-code segment (str) and
            returning its transformed form (str).

    Returns:
        The reassembled text. An empty or whitespace-only input yields ''.
    """
    code_pattern = re.compile(r'```.*?```', re.DOTALL)
    pieces = []
    cursor = 0

    def add_non_code(segment: str) -> None:
        # Whitespace-only gaps between fences carry no content.
        if segment.strip():
            # Trim edge newlines: the segment usually still carries the blank
            # line that separated it from the fence, and keeping it would
            # stack on top of the '\n\n' separator added by the final join.
            pieces.append(non_code_callback(segment).strip('\n'))

    for match in code_pattern.finditer(text):
        add_non_code(text[cursor:match.start()])
        pieces.append(match.group())  # fenced block is preserved exactly
        cursor = match.end()

    add_non_code(text[cursor:])

    return '\n\n'.join(pieces)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def process_soft_breaks(text: str) -> str:
    """
    Collapse soft line breaks while keeping block structure intact.

    Consecutive ordinary lines are stripped and joined with a single space
    into one paragraph line. A blank line ends the current paragraph and is
    emitted as an empty line, so runs of blank lines are kept as they are.
    Lines recognised by `is_structural_line` end the current paragraph and
    are emitted unchanged on their own line.
    """
    output = []
    pending = []

    def flush() -> None:
        # Emit the paragraph collected so far, if any, as one joined line.
        if pending:
            output.append(' '.join(pending))
            pending.clear()

    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            flush()
            output.append('')
        elif is_structural_line(raw_line):
            flush()
            output.append(raw_line)
        else:
            pending.append(stripped)

    flush()

    return '\n'.join(output)
|
|
86
152
|
|
|
87
153
|
|
|
88
154
|
def parse_content_block(text: str):
|
markdowndata/utils.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import re
|
|
1
2
|
from typing import Union
|
|
2
3
|
from dataclasses import dataclass
|
|
3
4
|
|
|
@@ -5,6 +6,9 @@ from bs4 import BeautifulSoup
|
|
|
5
6
|
from markdown_it import MarkdownIt
|
|
6
7
|
|
|
7
8
|
|
|
9
|
+
# Lines that must stay on their own line when soft breaks are joined:
#   - bullet list items      ("- x", "* x", "+ x", optionally indented)
#   - ordered list items     ("1. x", optionally indented)
#   - table rows             ("| a | b |")
#   - blockquotes            ("> x")
#   - ATX headings           ("# x" .. "###### x"); without this alternative a
#     heading followed directly by text would be merged into a single line,
#     turning the paragraph into part of the heading.
STRUCTURAL_LINE_RE = re.compile(
    r'^(\s*[-*+]\s+|\s*\d+\.\s+|\|.+\||>\s*|#{1,6}(?:\s|$))'
)
|
|
10
|
+
|
|
11
|
+
|
|
8
12
|
@dataclass
|
|
9
13
|
class Section:
|
|
10
14
|
"""
|
|
@@ -41,13 +45,33 @@ def get_md_soup(text: str) -> BeautifulSoup:
|
|
|
41
45
|
return BeautifulSoup(html, 'html.parser')
|
|
42
46
|
|
|
43
47
|
|
|
48
|
+
def is_single_tag_block(soup, tag_name: str) -> bool:
    """
    Tell whether `soup` holds exactly one top-level tag named `tag_name`.

    Only the tags returned by `soup.find_all(recursive=False)` are
    considered; the result is False when there are zero or several of them,
    or when the single one has a different name.
    """
    top_level = soup.find_all(recursive=False)
    if len(top_level) != 1:
        return False
    return top_level[0].name == tag_name
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def is_structural_line(line: str) -> bool:
    """
    Report whether `line` starts like a Markdown block element.

    A line counts as structural when `STRUCTURAL_LINE_RE` matches at its
    beginning (e.g. list item, table row, blockquote).
    """
    return STRUCTURAL_LINE_RE.match(line) is not None
|
|
62
|
+
|
|
63
|
+
|
|
44
64
|
def convert_value(value: str) -> Union[int, float, str]:
    """
    Convert a string to an int or float if possible, otherwise return it stripped.

    Rules, applied to the whitespace-stripped input:
      - all digits, optionally preceded by '-'  -> int
      - contains a '.' and parses as a float    -> float
      - anything else                           -> the stripped string

    No datetime conversion is performed. A value without a `strip` method
    (for example None) is returned unchanged.
    """
    try:
        value = value.strip()
    except AttributeError:
        # Not a string: hand it back untouched.
        return value

    # Accept a leading minus so "-5" is treated like "-5.0" rather than
    # staying a string. A leading '+' is deliberately left alone, since
    # values such as "+15551234567" are more likely identifiers than numbers.
    unsigned = value[1:] if value.startswith('-') else value

    try:
        if unsigned.isdigit():
            return int(value)
        if '.' in value:
            return float(value)
    except ValueError:
        # Text that merely contains a dot ("v1.0", "Hello world.") or a digit
        # character int() cannot parse (e.g. a superscript two) stays a string.
        return value

    return value
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
markdowndata/__init__.py,sha256=TBg-b5wXBdEfdUCOOOc7goTBgJVYPgH-dTb7U2D3n7Y,141
|
|
2
|
+
markdowndata/content_parser.py,sha256=FdapNfkvCj71K5lUSGQh0H1jS5vF36-l1EImsX28J2o,5387
|
|
3
|
+
markdowndata/process_markdown.py,sha256=cQJ0yQZpCzFC6AbXpZGWZWLN5XODLS9OqfkXWLL_usU,2166
|
|
4
|
+
markdowndata/section_tree.py,sha256=X4kPqHCa_2hkiq6roOHUc22G3pNIQjpUv_ni_J14XXQ,2060
|
|
5
|
+
markdowndata/utils.py,sha256=jVTKEGZiLCKdv1cS-73iMf_GaXT6cBc3cl9tp_fjKOw,2040
|
|
6
|
+
markdowndata-0.0.3.dist-info/LICENSE,sha256=K-k1T7XcwAVVmLsHhfWMye6r7p45xz3xwv5S5FBSyZE,1074
|
|
7
|
+
markdowndata-0.0.3.dist-info/METADATA,sha256=W5askwPGg46BYTkOH7OX_7j_z1ndqpnN4OiZK-w7kCE,633
|
|
8
|
+
markdowndata-0.0.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
9
|
+
markdowndata-0.0.3.dist-info/RECORD,,
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
markdowndata/__init__.py,sha256=TBg-b5wXBdEfdUCOOOc7goTBgJVYPgH-dTb7U2D3n7Y,141
|
|
2
|
-
markdowndata/content_parser.py,sha256=efO93hXGdWJd-t-X3-lqMpVi57T_5K7_2Hs5ivcTd3I,3379
|
|
3
|
-
markdowndata/process_markdown.py,sha256=cQJ0yQZpCzFC6AbXpZGWZWLN5XODLS9OqfkXWLL_usU,2166
|
|
4
|
-
markdowndata/section_tree.py,sha256=X4kPqHCa_2hkiq6roOHUc22G3pNIQjpUv_ni_J14XXQ,2060
|
|
5
|
-
markdowndata/utils.py,sha256=slrdI-G2iNgRI3mLrLj4uKqpLnKAsZGNA5gjjdYOC7g,1400
|
|
6
|
-
markdowndata-0.0.2.dist-info/LICENSE,sha256=K-k1T7XcwAVVmLsHhfWMye6r7p45xz3xwv5S5FBSyZE,1074
|
|
7
|
-
markdowndata-0.0.2.dist-info/METADATA,sha256=uLOKSKfBKawMLBR1hDzpVwbSsR9yNaog4SNuj13YxkU,633
|
|
8
|
-
markdowndata-0.0.2.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
9
|
-
markdowndata-0.0.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|