markdowndata 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markdowndata-0.0.1/LICENSE +21 -0
- markdowndata-0.0.1/PKG-INFO +17 -0
- markdowndata-0.0.1/markdowndata/__init__.py +6 -0
- markdowndata-0.0.1/markdowndata/content_parser.py +114 -0
- markdowndata-0.0.1/markdowndata/process_markdown.py +59 -0
- markdowndata-0.0.1/markdowndata/section_tree.py +58 -0
- markdowndata-0.0.1/markdowndata/utils.py +53 -0
- markdowndata-0.0.1/pyproject.toml +20 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 BYU-CS-Course-Ops
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: markdowndata
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Tool to convert markdown tables into json objects
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: Gordon Bean
|
|
7
|
+
Author-email: gbean@cs.byu.edu
|
|
8
|
+
Requires-Python: >=3.10,<4.0
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Requires-Dist: bs4 (>=0.0.2,<0.0.3)
|
|
16
|
+
Requires-Dist: markdown-it-py (>=3.0.0,<4.0.0)
|
|
17
|
+
Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import yaml
|
|
3
|
+
|
|
4
|
+
from .utils import convert_value, get_md_soup
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def detect_value_type(text: str) -> str | None:
|
|
8
|
+
"""
|
|
9
|
+
Detect the type of content block (YAML, table, list, or text).
|
|
10
|
+
Returns the content type string or None if no match is found.
|
|
11
|
+
"""
|
|
12
|
+
text = text.strip()
|
|
13
|
+
if not text:
|
|
14
|
+
return None
|
|
15
|
+
|
|
16
|
+
# YAML detection (delimited by ---)
|
|
17
|
+
if re.search(r'---\s*\n(.*?)\n---', text, re.DOTALL):
|
|
18
|
+
return 'yaml_dict'
|
|
19
|
+
|
|
20
|
+
# Convert markdown to HTML and analyze for tables, lists, or text
|
|
21
|
+
soup = get_md_soup(text)
|
|
22
|
+
if soup.find('table'):
|
|
23
|
+
return 'md_table'
|
|
24
|
+
elif soup.find('ul'):
|
|
25
|
+
return 'md_list'
|
|
26
|
+
elif soup.get_text(strip=True):
|
|
27
|
+
return 'md_text'
|
|
28
|
+
|
|
29
|
+
return None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def yaml_dict_parser(text: str) -> dict:
    """
    Parse the first '---'-delimited YAML block in ``text`` into a dictionary.

    Each top-level value is passed through ``convert_value``; nested values
    are left as loaded. Returns an empty dict when no delimited block is
    found, when the block is empty, or when the YAML is not a mapping.
    """
    match = re.search(r'---\s*\n(.*?)\n---', text, re.DOTALL)
    if not match:
        return {}

    yaml_data = yaml.safe_load(match.group(1))
    # safe_load may legitimately produce a list or a scalar (for example when
    # plain prose sits between two '---' rules). Only a mapping has .items(),
    # so anything else is reported as "no data" instead of raising
    # AttributeError.
    if not isinstance(yaml_data, dict):
        return {}
    return {key: convert_value(val) for key, val in yaml_data.items()}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def md_table_parser(text: str) -> list[dict]:
    """
    Parse the first Markdown table in ``text`` into a list of row dictionaries.

    The markdown is rendered to HTML and the first <table> element is read.
    Header cell texts become the keys; each body cell is passed through
    ``convert_value``. Rows whose cell count differs from the header count
    are skipped. Returns an empty list when no table is present.
    """
    soup = get_md_soup(text)
    table = soup.find('table')
    if not table:
        return []

    # get_text(strip=True) strips every text node and joins them with an empty
    # separator, which glues words across inline markup ("a **b**" -> "ab").
    # Take the full text and trim only the outer whitespace instead.
    headers = [th.get_text().strip() for th in table.find_all('th')]

    rows = []
    for tr in table.find_all('tr')[1:]:  # Skip header row
        cells = [
            convert_value(cell.get_text().strip())
            for cell in tr.find_all(['td', 'th'])
        ]
        if len(cells) == len(headers):
            rows.append(dict(zip(headers, cells)))

    return rows
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def md_list_parser(text: str) -> list:
    """
    Parse the first unordered Markdown list in ``text`` into a list of values.

    The markdown is rendered to HTML and every <li> found under the first
    <ul> element (nested items included) contributes one entry, passed
    through ``convert_value``. Returns an empty list when no <ul> is present.
    """
    soup = get_md_soup(text)
    ul = soup.find('ul')
    if not ul:
        return []

    # get_text(strip=True) joins the stripped text nodes with no separator,
    # so "item **bold** text" would collapse to "itemboldtext". Read the full
    # text and trim only the outer whitespace.
    return [convert_value(li.get_text().strip()) for li in ul.find_all('li')]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def md_text_parser(text: str) -> str:
    """
    Parse a Markdown text block and return its text content.

    The markdown is rendered to HTML, the visible text is extracted, and the
    non-blank lines are joined with single spaces so the block reads as one
    paragraph. The result is passed through ``convert_value``, so purely
    numeric text comes back as a number rather than a string.
    """
    soup = get_md_soup(text)
    # get_text(strip=True) would strip each text node and concatenate them
    # with no separator, fusing words across inline markup and across
    # paragraphs. Extract the raw text and normalise it line by line instead.
    lines = (line.strip() for line in soup.get_text().splitlines())
    return convert_value(' '.join(line for line in lines if line))
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def parse_content_block(text: str):
    """
    Parse a block of Markdown text into structured data.

    Detects the content type (YAML, table, list, text) and dispatches to the
    matching parser.

    Returns an empty dict for blank input, otherwise the parser's result
    (dict, list, str, or a number).

    Raises ValueError when no content type can be detected or when the
    selected parser produces an empty result.
    """
    text = text.strip()
    if not text:
        return {}

    v_type = detect_value_type(text)
    if not v_type:
        raise ValueError(f'No parser found for content: {text}')

    parser_functions = {
        'yaml_dict': yaml_dict_parser,
        'md_table': md_table_parser,
        'md_list': md_list_parser,
        'md_text': md_text_parser
    }

    value = parser_functions[v_type](text)

    # A text block such as "0" is converted to the number 0, which is falsy
    # but is real content; only genuinely empty results are rejected.
    is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
    if not value and not is_number:
        raise ValueError(f'Parser for {v_type} returned empty value for: {text}')
    return value
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from typing import Union, List, IO
|
|
2
|
+
from .section_tree import split_sections, build_section_tree
|
|
3
|
+
from .utils import Node
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class MarkDataParser:
    """
    Parses a Markdown document into a JSON-like dictionary structure.

    Builds a hierarchy of sections from the document's headers and converts
    each section's content into a structured form. The most recent result is
    kept on ``self.data``.
    """
    def __init__(self):
        self.data = {}

    def load(self, file: Union[str, IO]) -> dict:
        """
        Load markdown from a path or a file-like object and parse it.

        ``file`` may be a path (str or path-like) or any object with a
        ``read()`` method. Paths are opened as UTF-8 text; bytes returned by
        a binary stream are decoded as UTF-8.

        Returns the nested dictionary, which is also stored on ``self.data``.
        """
        if hasattr(file, 'read'):
            text = file.read()
            if isinstance(text, bytes):
                text = text.decode('utf-8')
        else:
            # Explicit encoding: the platform default is not UTF-8 everywhere,
            # which would garble or reject non-ASCII markdown.
            with open(file, 'r', encoding='utf-8') as f:
                text = f.read()

        # Split the text into Section objects based on markdown headers
        sections = split_sections(text)

        # Build a hierarchical tree of sections and subsections
        section_tree = build_section_tree(sections)

        # Convert the section tree into a JSON-like dictionary structure
        self.data = self.build_dict(section_tree)
        return self.data

    def build_dict(self, sections: List["Node"]) -> dict:
        """
        Recursively convert a list of Node objects into a nested dictionary.

        Each node's title becomes a key. Dict content is merged with the
        node's subsections (subsection keys win on collision); other content
        is stored under 'content' next to the subsections, or returned as-is
        when the node has no subsections.
        """
        result = {}
        for node in sections:
            sub_dict = self.build_dict(node.subsections)

            if isinstance(node.parsed, dict):
                # If the parsed content is a dictionary, merge it with its subsections
                merged = {**node.parsed, **sub_dict}
            elif node.subsections:
                # Non-dict content alongside subsections is wrapped under
                # 'content'. A numeric zero is falsy but is still content.
                has_content = bool(node.parsed) or (
                    isinstance(node.parsed, (int, float))
                    and not isinstance(node.parsed, bool)
                )
                merged = {'content': node.parsed, **sub_dict} if has_content else sub_dict
            else:
                # If no subsections, return the parsed content as-is
                merged = node.parsed

            # Use the node's title as the key in the dictionary
            result[node.title] = merged
        return result
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from .utils import Section, Node
|
|
3
|
+
from .content_parser import parse_content_block
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def split_sections(text: str):
    """
    Cut the Markdown text into a flat list of Section objects.

    Every line that starts with one or more '#' followed by a space opens a
    new section. The section runs up to the next such line, or to the end of
    the text for the last one. Text before the first header is not included
    in any section.
    """
    header_re = re.compile(r'^(?P<header>#+) (?P<title>[^\n]+)', re.MULTILINE)
    found = list(header_re.finditer(text))

    # Section i stops where header i+1 starts; the final one stops at the
    # end of the document.
    stops = [m.start() for m in found[1:]] + [len(text)]

    return [
        Section(
            title=m.group('title').strip(),      # header text without the '#' prefix
            level=len(m.group('header')),        # number of '#' characters
            start=m.start(),                     # offset of the header line
            end=stop,                            # offset where the section ends
            content=text[m.end():stop].strip(),  # body text, header excluded
        )
        for m, stop in zip(found, stops)
    ]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def build_section_tree(sections):
    """
    Arrange a flat list of Section objects into a tree of Nodes.

    A stack of open ancestors is kept while walking the sections in order;
    each section is attached to the nearest preceding section with a
    strictly smaller level. Returns the list of top-level nodes.
    """
    top = Node(title='Root', level=0, parsed={}, subsections=[])
    ancestors = [top]

    for sec in sections:
        current = Node(
            title=sec.title,
            level=sec.level,
            parsed=parse_content_block(sec.content),
            subsections=[],
        )

        # Drop every open ancestor that is at the same depth or deeper;
        # what remains on top is this node's parent.
        while ancestors and ancestors[-1].level >= sec.level:
            ancestors.pop()

        ancestors[-1].subsections.append(current)

        # The new node may itself receive children from later sections.
        ancestors.append(current)

    return top.subsections
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from typing import Union
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
from bs4 import BeautifulSoup
|
|
5
|
+
from markdown_it import MarkdownIt
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class Section:
    """
    One header-delimited slice of the Markdown document.

    Records the header title and level, the character offsets of the slice
    in the original text, and the raw body text.
    """
    title: str    # header text
    level: int    # header depth (count of '#')
    start: int    # offset where the header begins
    end: int      # offset where the section stops
    content: str  # raw body text, header excluded
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class Node:
    """
    Represents a node in the hierarchical section tree.

    Holds the section title, header level, parsed content, and nested
    subsections.
    """
    title: str  # section header text
    level: int  # header depth; the synthetic root uses 0
    # Parsed content: dict (YAML), list (table rows or list items), or the
    # result of convert_value on plain text, which may be a str, int or float.
    parsed: Union[dict, list, str, int, float]
    subsections: list["Node"]  # child nodes, in document order
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_md_soup(text: str) -> BeautifulSoup:
    """
    Render a Markdown text block to HTML and return it as a BeautifulSoup tree.

    A fresh MarkdownIt parser is created on every call, with the "table" and
    "code" rules switched on.
    """
    # enable() returns the parser itself, so the calls can be chained.
    rendered = MarkdownIt().enable("table").enable("code").render(text)
    return BeautifulSoup(rendered, 'html.parser')
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def convert_value(value: str) -> Union[int, float, str]:
    """
    Convert a string to an int or float if possible, else return it stripped.

    Surrounding whitespace is removed first. Integer literals are parsed
    directly as int; other numeric text is parsed as float and returned as
    an int when it has no fractional part (e.g. "2.0" -> 2). Text that is not
    numeric is returned as the stripped string. Values without a ``strip``
    method (numbers, lists, None, ...) are returned unchanged.
    """
    try:
        value = value.strip()
    except AttributeError:
        # Not a string-like object: nothing to convert.
        return value

    # Parse integers directly. Going through float() first silently corrupts
    # integers that do not fit in a double (anything above 2**53).
    try:
        return int(value)
    except ValueError:
        pass

    try:
        num = float(value)
    except ValueError:
        return value
    return int(num) if num.is_integer() else num
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "markdowndata"
|
|
3
|
+
version = "0.0.1"
|
|
4
|
+
description = "Tool to convert markdown tables into json objects"
|
|
5
|
+
authors = ["Gordon Bean <gbean@cs.byu.edu>", "Robert Greathouse <robbykap@byu.edu>"]
|
|
6
|
+
license = "MIT"
|
|
7
|
+
|
|
8
|
+
[tool.poetry.dependencies]
|
|
9
|
+
python = "^3.10"
|
|
10
|
+
pyyaml = "^6.0.2"
|
|
11
|
+
bs4 = "^0.0.2"
|
|
12
|
+
markdown-it-py = "^3.0.0"
|
|
13
|
+
|
|
14
|
+
[tool.poetry.group.dev.dependencies]
|
|
15
|
+
python-dotenv = "^1.0.1"
|
|
16
|
+
pytest = "^8.2.2"
|
|
17
|
+
|
|
18
|
+
[build-system]
|
|
19
|
+
requires = ["poetry-core>=1.0.0"]
|
|
20
|
+
build-backend = "poetry.core.masonry.api"
|