markdowndata 0.0.1__tar.gz → 0.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {markdowndata-0.0.1 → markdowndata-0.0.3}/PKG-INFO +2 -2
- markdowndata-0.0.3/markdowndata/content_parser.py +180 -0
- {markdowndata-0.0.1 → markdowndata-0.0.3}/markdowndata/utils.py +26 -2
- {markdowndata-0.0.1 → markdowndata-0.0.3}/pyproject.toml +1 -1
- markdowndata-0.0.1/markdowndata/content_parser.py +0 -114
- {markdowndata-0.0.1 → markdowndata-0.0.3}/LICENSE +0 -0
- {markdowndata-0.0.1 → markdowndata-0.0.3}/markdowndata/__init__.py +0 -0
- {markdowndata-0.0.1 → markdowndata-0.0.3}/markdowndata/process_markdown.py +0 -0
- {markdowndata-0.0.1 → markdowndata-0.0.3}/markdowndata/section_tree.py +0 -0
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import yaml
|
|
3
|
+
|
|
4
|
+
from .utils import convert_value, get_md_soup, is_single_tag_block, is_structural_line
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def detect_value_type(text: str) -> str | None:
|
|
8
|
+
"""
|
|
9
|
+
Detect the type of content block (YAML, table, list, or text).
|
|
10
|
+
Only classifies as md_table or md_list if the content is *only* that structure.
|
|
11
|
+
"""
|
|
12
|
+
text = text.strip()
|
|
13
|
+
if not text:
|
|
14
|
+
return None
|
|
15
|
+
|
|
16
|
+
# YAML detection (delimited by ===)
|
|
17
|
+
if re.search(r'===\s*\n(.*?)\n===', text, re.DOTALL):
|
|
18
|
+
return 'yaml_dict'
|
|
19
|
+
|
|
20
|
+
# Convert markdown to HTML
|
|
21
|
+
soup = get_md_soup(text)
|
|
22
|
+
|
|
23
|
+
# Check for exactly one <table> and no other tags or text
|
|
24
|
+
if soup.find('table') and is_single_tag_block(soup, 'table'):
|
|
25
|
+
return 'md_table'
|
|
26
|
+
|
|
27
|
+
# Check for exactly one <ul> and no other tags or text
|
|
28
|
+
if soup.find('ul') and is_single_tag_block(soup, 'ul'):
|
|
29
|
+
return 'md_list'
|
|
30
|
+
|
|
31
|
+
# Fallback: process everything as Markdown text
|
|
32
|
+
return 'md_text'
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def yaml_dict_parser(text: str) -> dict:
    """
    Extract the first ===-delimited YAML block from `text` and return it as a dict.

    Each top-level value is passed through `convert_value`. An empty dict is
    returned when no delimited block is found, when the block is empty, or
    when the YAML document is not a mapping (for example a bare list or a
    scalar), because only a mapping can be represented as a dictionary.
    """
    match = re.search(r'===\s*\n(.*?)\n===', text, re.DOTALL)
    if match is None:
        return {}

    yaml_data = yaml.safe_load(match.group(1))
    # safe_load may yield a list, a scalar or None; calling .items() on those
    # would raise AttributeError, so only mappings are accepted here.
    if not isinstance(yaml_data, dict):
        return {}

    return {key: convert_value(item) for key, item in yaml_data.items()}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def md_table_parser(text: str) -> list[dict]:
    """
    Render `text` as Markdown and turn its first table into a list of row dicts.

    Keys are the texts of the table's <th> cells; values are the converted
    cell texts of each subsequent row. Rows whose cell count differs from the
    header count are skipped. Returns an empty list when the rendered HTML
    contains no <table>.
    """
    table = get_md_soup(text).find('table')
    if not table:
        return []

    column_names = [cell.get_text(strip=True) for cell in table.find_all('th')]
    width = len(column_names)

    records = []
    body_rows = table.find_all('tr')[1:]  # the first <tr> is the header row
    for row in body_rows:
        values = [
            convert_value(cell.get_text(strip=True))
            for cell in row.find_all(['td', 'th'])
        ]
        if len(values) != width:
            continue
        records.append(dict(zip(column_names, values)))

    return records
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def md_list_parser(text: str) -> list:
    """
    Render `text` as Markdown and return the items of its first <ul> as values.

    Every <li> found beneath that list contributes its stripped text, passed
    through `convert_value`. Returns an empty list when there is no <ul>.
    """
    soup = get_md_soup(text)
    first_list = soup.find('ul')
    if not first_list:
        return []

    items = []
    for entry in first_list.find_all('li'):
        items.append(convert_value(entry.get_text(strip=True)))
    return items
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def md_text_parser(text: str) -> str:
    """
    Normalise a Markdown text block.

    Text outside fenced code blocks is run through `process_soft_breaks`;
    fenced code blocks are passed through unchanged by `process_code_blocks`.
    The combined result is finally handed to `convert_value`.
    """
    return convert_value(process_code_blocks(text, process_soft_breaks))
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def process_code_blocks(text: str, non_code_callback) -> str:
    """
    Apply `non_code_callback` to every stretch of `text` outside ``` fences.

    Fenced segments are copied verbatim. Non-code stretches that contain only
    whitespace are dropped. All kept pieces are joined with a blank line
    ('\\n\\n') between them.
    """
    fence = re.compile(r'```.*?```', re.DOTALL)
    pieces = []
    cursor = 0

    def emit_plain(segment):
        # Whitespace-only gaps between fences are not passed to the callback.
        if segment.strip():
            pieces.append(non_code_callback(segment))

    for fenced in fence.finditer(text):
        emit_plain(text[cursor:fenced.start()])
        pieces.append(fenced.group())
        cursor = fenced.end()

    # Whatever follows the last fence (or the whole text if there was none).
    emit_plain(text[cursor:])

    return '\n\n'.join(pieces)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def process_soft_breaks(text: str) -> str:
    """
    Merge consecutive ordinary lines into one space-separated line.

    Blank lines and lines recognised by `is_structural_line` end the current
    run of ordinary lines. Blank lines are emitted as empty strings and
    structural lines are emitted unchanged, so paragraph breaks and block
    elements keep their own lines.
    """
    output = []
    pending = []

    def flush():
        # Emit the accumulated paragraph lines as a single joined line.
        if pending:
            output.append(' '.join(pending))
            pending.clear()

    for raw in text.split('\n'):
        is_blank = raw.strip() == ''
        if is_blank or is_structural_line(raw):
            flush()
            output.append('' if is_blank else raw)
        else:
            pending.append(raw.strip())

    flush()
    return '\n'.join(output)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def parse_content_block(text: str):
    """
    Parse a block of Markdown text into structured data.

    The content type is detected with `detect_value_type` and the block is
    dispatched to the matching parser (YAML dict, table, list or text).

    Returns an empty dict for blank input; otherwise the parser's result.
    Raises ValueError when no content type is detected or when the parser
    produces an empty result (None, or an empty string, list or dict).
    """
    text = text.strip()
    if not text:
        return {}

    v_type = detect_value_type(text)
    if not v_type:
        raise ValueError(f'No parser found for content: {text}')

    parser_functions = {
        'yaml_dict': yaml_dict_parser,
        'md_table': md_table_parser,
        'md_list': md_list_parser,
        'md_text': md_text_parser,
    }

    value = parser_functions[v_type](text)

    # Only None and empty containers/strings count as "empty". A numeric 0 or
    # 0.0 (e.g. the text "0" after value conversion) is a valid parsed value
    # and must not be rejected by a plain truthiness check.
    is_empty = value is None or (isinstance(value, (str, list, dict)) and not value)
    if is_empty:
        raise ValueError(f'Parser for {v_type} returned empty value for: {text}')
    return value
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import re
|
|
1
2
|
from typing import Union
|
|
2
3
|
from dataclasses import dataclass
|
|
3
4
|
|
|
@@ -5,6 +6,9 @@ from bs4 import BeautifulSoup
|
|
|
5
6
|
from markdown_it import MarkdownIt
|
|
6
7
|
|
|
7
8
|
|
|
9
|
+
# Matches the beginning of a line that opens a Markdown block element:
#   - bullet list item: optional indent, one of "-", "*", "+", then whitespace
#   - numbered list item: optional indent, digits, ".", then whitespace
#   - table row: "|" at the very start of the line with a later "|"
#   - blockquote: ">" at the very start of the line
STRUCTURAL_LINE_RE = re.compile(r'^(\s*[-*+]\s+|\s*\d+\.\s+|\|.+\||>\s*)')
|
|
10
|
+
|
|
11
|
+
|
|
8
12
|
@dataclass
|
|
9
13
|
class Section:
|
|
10
14
|
"""
|
|
@@ -41,13 +45,33 @@ def get_md_soup(text: str) -> BeautifulSoup:
|
|
|
41
45
|
return BeautifulSoup(html, 'html.parser')
|
|
42
46
|
|
|
43
47
|
|
|
48
|
+
def is_single_tag_block(soup, tag_name: str) -> bool:
    """
    Tell whether `soup` has exactly one direct child tag and it is `tag_name`.

    Only direct child tags are considered (non-recursive search), so a
    <table> or <ul> nested inside another element does not qualify.
    """
    top_level = soup.find_all(recursive=False)
    if len(top_level) != 1:
        return False
    return top_level[0].name == tag_name
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def is_structural_line(line: str) -> bool:
    """
    Report whether `line` starts a Markdown block element.

    The decision is made by matching STRUCTURAL_LINE_RE against the start of
    the line (list item, table row or blockquote).
    """
    return STRUCTURAL_LINE_RE.match(line) is not None
|
|
62
|
+
|
|
63
|
+
|
|
44
64
|
def convert_value(value: str) -> Union[int, float, str]:
    """
    Convert a string to an int or float when possible.

    The input is stripped of surrounding whitespace first. A string made of
    digits, optionally preceded by a minus sign, becomes an int; a string
    containing '.' that parses as a number becomes a float. Any other string
    is returned stripped. Values without a `strip` method (e.g. numbers that
    were already parsed upstream) are returned unchanged.
    """
    try:
        text = value.strip()
    except AttributeError:
        # Not a string-like value; nothing to convert.
        return value

    # Allow a single leading minus so "-5" is treated like "-5.0" already is.
    unsigned = text[1:] if text[:1] == '-' else text

    try:
        if unsigned.isdigit():
            return int(text)
        if '.' in text:
            return float(text)
    except ValueError:
        # e.g. "a.b" or digit-like characters int() cannot parse: keep as text.
        pass
    return text
|
|
@@ -1,114 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
import yaml
|
|
3
|
-
|
|
4
|
-
from .utils import convert_value, get_md_soup
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def detect_value_type(text: str) -> str | None:
|
|
8
|
-
"""
|
|
9
|
-
Detect the type of content block (YAML, table, list, or text).
|
|
10
|
-
Returns the content type string or None if no match is found.
|
|
11
|
-
"""
|
|
12
|
-
text = text.strip()
|
|
13
|
-
if not text:
|
|
14
|
-
return None
|
|
15
|
-
|
|
16
|
-
# YAML detection (delimited by ---)
|
|
17
|
-
if re.search(r'---\s*\n(.*?)\n---', text, re.DOTALL):
|
|
18
|
-
return 'yaml_dict'
|
|
19
|
-
|
|
20
|
-
# Convert markdown to HTML and analyze for tables, lists, or text
|
|
21
|
-
soup = get_md_soup(text)
|
|
22
|
-
if soup.find('table'):
|
|
23
|
-
return 'md_table'
|
|
24
|
-
elif soup.find('ul'):
|
|
25
|
-
return 'md_list'
|
|
26
|
-
elif soup.get_text(strip=True):
|
|
27
|
-
return 'md_text'
|
|
28
|
-
|
|
29
|
-
return None
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def yaml_dict_parser(text: str) -> dict:
    """
    Extract the first ----delimited YAML block from `text` and return it as a dict.

    Each top-level value is passed through `convert_value`. An empty dict is
    returned when no delimited block is found, when the block is empty, or
    when the YAML document is not a mapping (for example a bare list or a
    scalar), because only a mapping can be represented as a dictionary.
    """
    match = re.search(r'---\s*\n(.*?)\n---', text, re.DOTALL)
    if match is None:
        return {}

    yaml_data = yaml.safe_load(match.group(1))
    # safe_load may yield a list, a scalar or None; calling .items() on those
    # would raise AttributeError, so only mappings are accepted here.
    if not isinstance(yaml_data, dict):
        return {}

    return {key: convert_value(item) for key, item in yaml_data.items()}
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
def md_table_parser(text: str) -> list[dict]:
    """
    Render `text` as Markdown and turn its first table into a list of row dicts.

    Keys are the texts of the table's <th> cells; values are the converted
    cell texts of each subsequent row. Rows whose cell count differs from the
    header count are skipped. Returns an empty list when there is no <table>.
    """
    table = get_md_soup(text).find('table')
    if not table:
        return []

    column_names = [cell.get_text(strip=True) for cell in table.find_all('th')]

    records = []
    # Slice off the first <tr>, which holds the header cells.
    for row in table.find_all('tr')[1:]:
        values = [
            convert_value(cell.get_text(strip=True))
            for cell in row.find_all(['td', 'th'])
        ]
        if len(values) == len(column_names):
            records.append(dict(zip(column_names, values)))

    return records
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def md_list_parser(text: str) -> list:
    """
    Render `text` as Markdown and return the items of its first <ul> as values.

    Every <li> found beneath that list contributes its stripped text, passed
    through `convert_value`. Returns an empty list when there is no <ul>.
    """
    first_list = get_md_soup(text).find('ul')
    if not first_list:
        return []

    items = []
    for entry in first_list.find_all('li'):
        items.append(convert_value(entry.get_text(strip=True)))
    return items
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
def md_text_parser(text: str) -> str:
    """
    Render `text` as Markdown and return its plain text on a single line.

    The rendered text is stripped, its lines are joined with single spaces,
    and the result is passed through `convert_value`.
    """
    flattened = get_md_soup(text).get_text(strip=True)
    single_line = ' '.join(flattened.splitlines())
    return convert_value(single_line)
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
def parse_content_block(text: str):
    """
    Parse a block of Markdown text into structured data.

    The content type is detected with `detect_value_type` and the block is
    dispatched to the matching parser (YAML dict, table, list or text).

    Returns an empty dict for blank input; otherwise the parser's result.
    Raises ValueError when no content type is detected or when the parser
    produces an empty result (None, or an empty string, list or dict).
    """
    text = text.strip()
    if not text:
        return {}

    v_type = detect_value_type(text)
    if not v_type:
        raise ValueError(f'No parser found for content: {text}')

    parser_functions = {
        'yaml_dict': yaml_dict_parser,
        'md_table': md_table_parser,
        'md_list': md_list_parser,
        'md_text': md_text_parser,
    }

    value = parser_functions[v_type](text)

    # Only None and empty containers/strings count as "empty". A numeric 0 or
    # 0.0 (e.g. the text "0" after value conversion) is a valid parsed value
    # and must not be rejected by a plain truthiness check.
    is_empty = value is None or (isinstance(value, (str, list, dict)) and not value)
    if is_empty:
        raise ValueError(f'Parser for {v_type} returned empty value for: {text}')
    return value
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|