markdowndata 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 BYU-CS-Course-Ops
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.1
2
+ Name: markdowndata
3
+ Version: 0.0.1
4
+ Summary: Tool to convert markdown tables into json objects
5
+ License: MIT
6
+ Author: Gordon Bean
7
+ Author-email: gbean@cs.byu.edu
8
+ Requires-Python: >=3.10,<4.0
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Requires-Dist: bs4 (>=0.0.2,<0.0.3)
16
+ Requires-Dist: markdown-it-py (>=3.0.0,<4.0.0)
17
+ Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
@@ -0,0 +1,6 @@
1
+ from .process_markdown import MarkDataParser
2
+
3
def load(file):
    """
    Parse a Markdown document and return its structured data.

    `file` may be an open file object, a path string, or any other object
    with a `read()` method (for example `io.StringIO`).

    Returns the nested dictionary produced by `MarkDataParser.load`.
    """
    parser = MarkDataParser()
    # A file opened from a path exposes that path as a string `.name`; keep
    # handing the path to the parser so the file is reopened and read from
    # the start. Anything else (a path string, or a stream without a string
    # name) is passed through unchanged, because MarkDataParser.load accepts
    # both a path and a readable object.
    name = getattr(file, 'name', None)
    parser.load(name if isinstance(name, str) else file)
    return parser.data
@@ -0,0 +1,114 @@
1
+ import re
2
+ import yaml
3
+
4
+ from .utils import convert_value, get_md_soup
5
+
6
+
7
+ def detect_value_type(text: str) -> str | None:
8
+ """
9
+ Detect the type of content block (YAML, table, list, or text).
10
+ Returns the content type string or None if no match is found.
11
+ """
12
+ text = text.strip()
13
+ if not text:
14
+ return None
15
+
16
+ # YAML detection (delimited by ---)
17
+ if re.search(r'---\s*\n(.*?)\n---', text, re.DOTALL):
18
+ return 'yaml_dict'
19
+
20
+ # Convert markdown to HTML and analyze for tables, lists, or text
21
+ soup = get_md_soup(text)
22
+ if soup.find('table'):
23
+ return 'md_table'
24
+ elif soup.find('ul'):
25
+ return 'md_list'
26
+ elif soup.get_text(strip=True):
27
+ return 'md_text'
28
+
29
+ return None
30
+
31
+
32
def yaml_dict_parser(text: str) -> dict:
    """
    Parse the first YAML block delimited by --- lines into a dictionary.

    Each top-level value is passed through convert_value. Returns an empty
    dictionary when no delimited block is found, when the block is empty,
    or when the YAML document is not a mapping (for example a list or a
    bare scalar).
    """
    match = re.search(r'---\s*\n(.*?)\n---', text, re.DOTALL)
    if match is None:
        return {}

    yaml_data = yaml.safe_load(match.group(1))
    # safe_load can return a list, scalar or None; only a mapping has
    # .items(), so anything else is reported as "nothing parsed" instead of
    # crashing with AttributeError.
    if not isinstance(yaml_data, dict):
        return {}
    return {k: convert_value(v) for k, v in yaml_data.items()}
43
+
44
+
45
def md_table_parser(text: str) -> list[dict]:
    """
    Parse the first Markdown table in `text` into a list of dictionaries.

    The markdown is rendered to HTML; the <th> cells provide the keys and
    every row after the first provides one dictionary, with cell values
    passed through convert_value. Rows whose cell count differs from the
    number of headers are skipped. Returns an empty list when the rendered
    HTML contains no <table>.
    """
    soup = get_md_soup(text)
    table = soup.find('table')
    if not table:
        return []

    def cell_text(cell) -> str:
        # get_text(strip=True) strips every text fragment and joins them with
        # no separator, so "a **b** c" would come out as "abc". Take the raw
        # text and collapse whitespace runs instead.
        return ' '.join(cell.get_text().split())

    # Extract headers and row data
    headers = [cell_text(th) for th in table.find_all('th')]
    rows = []
    for tr in table.find_all('tr')[1:]:  # Skip header row
        cells = [convert_value(cell_text(td)) for td in tr.find_all(['td', 'th'])]
        if len(cells) == len(headers):
            rows.append(dict(zip(headers, cells)))

    return rows
64
+
65
+
66
def md_list_parser(text: str) -> list:
    """
    Parse the first Markdown bullet list in `text` into a list of values.

    The markdown is rendered to HTML and every <li> found under the first
    <ul> contributes one entry, passed through convert_value. Returns an
    empty list when the rendered HTML contains no <ul>.
    """
    soup = get_md_soup(text)
    ul = soup.find('ul')
    if not ul:
        return []

    # get_text(strip=True) strips every text fragment and joins them with no
    # separator, so an item like "a **b** c" would become "abc". Take the raw
    # text and collapse whitespace runs instead.
    # NOTE(review): find_all('li') is presumably recursive, so items of nested
    # lists would also appear as their own entries - confirm this is intended.
    return [
        convert_value(' '.join(li.get_text().split()))
        for li in ul.find_all('li')
    ]
76
+
77
+
78
def md_text_parser(text: str) -> int | float | str:
    """
    Parse a Markdown text block and return its text content.

    The markdown is rendered to HTML, the markup is dropped, and all
    whitespace runs (including line breaks and paragraph boundaries) are
    collapsed to single spaces so the text reads as one paragraph. The
    result is passed through convert_value, so purely numeric text comes
    back as a number rather than a string.
    """
    soup = get_md_soup(text)
    # get_text(strip=True) strips every text fragment and joins them with no
    # separator, which glues words together around inline markup
    # ("some **bold** text" -> "someboldtext") and between paragraphs.
    # Taking the raw text and normalising whitespace keeps the word breaks.
    flowed_text = ' '.join(soup.get_text().split())
    return convert_value(flowed_text)
86
+
87
+
88
def parse_content_block(text: str):
    """
    Parse a given block of Markdown text into structured data.

    Automatically detects the content type (YAML, table, list, text) and
    dispatches to the appropriate parser.

    Returns an empty dictionary for blank input; otherwise returns whatever
    the selected parser produced (dict, list, string or number).

    Raises ValueError if no content type can be detected, or if the selected
    parser produces nothing (None, or an empty string/list/dict).
    """
    text = text.strip()
    if not text:
        return {}

    v_type = detect_value_type(text)
    if not v_type:
        raise ValueError(f'No parser found for content: {text}')

    parser_functions = {
        'yaml_dict': yaml_dict_parser,
        'md_table': md_table_parser,
        'md_list': md_list_parser,
        'md_text': md_text_parser
    }

    value = parser_functions[v_type](text)
    # A number such as 0 is a legitimate parsed value even though it is
    # falsy; only None and empty strings/containers count as "nothing parsed".
    is_number = isinstance(value, (int, float))
    if value is None or (not is_number and not value):
        raise ValueError(f'Parser for {v_type} returned empty value for: {text}')
    return value
@@ -0,0 +1,59 @@
1
+ from typing import Union, List, IO
2
+ from .section_tree import split_sections, build_section_tree
3
+ from .utils import Node
4
+
5
+
6
class MarkDataParser:
    """
    Parses a Markdown document into a JSON-like dictionary structure.
    Builds a hierarchy of sections and converts each section's content into a structured form.
    """
    def __init__(self):
        # Result of the most recent load(); empty until load() is called
        self.data = {}

    def load(self, file: Union[str, IO]) -> dict:
        """
        Loads markdown content from a file path or file-like object, parses it,
        and builds a nested dictionary of structured data.

        A string is treated as a path and read as UTF-8; any other object
        must provide a read() method returning the document text.

        Returns the resulting dictionary, which is also stored in self.data.
        """
        if isinstance(file, str):
            # Read as UTF-8 explicitly so the result does not depend on the
            # platform's locale encoding.
            with open(file, 'r', encoding='utf-8') as f:
                text = f.read()
        else:
            text = file.read()

        # Split the text into Section objects based on markdown headers
        sections = split_sections(text)

        # Build a hierarchical tree of sections and subsections
        section_tree = build_section_tree(sections)

        # Convert the section tree into a JSON-like dictionary structure
        self.data = self.build_dict(section_tree)
        return self.data

    def build_dict(self, sections: "List[Node]") -> dict:
        """
        Recursively converts a list of Node objects into a JSON-like dictionary structure.

        Each node's title becomes a key. The value is the node's parsed
        content, merged with the dictionary built from its subsections when
        there are any. Nodes that share a title at the same level overwrite
        one another; the last one wins.
        """
        result = {}
        for node in sections:
            sub_dict = self.build_dict(node.subsections)

            if isinstance(node.parsed, dict):
                # If the parsed content is a dictionary, merge it with its subsections
                # (subsection keys take precedence on a clash)
                merged = {**node.parsed, **sub_dict}
            elif node.subsections:
                # Subsections exist but the parsed content is not a dict:
                # keep the content under 'content' next to the subsections.
                # Only None and empty strings/lists count as "no content";
                # a numeric 0 is real content and must not be dropped.
                no_content = node.parsed is None or (
                    isinstance(node.parsed, (str, list)) and not node.parsed
                )
                if no_content:
                    merged = sub_dict
                else:
                    merged = {'content': node.parsed, **sub_dict}
            else:
                # If no subsections, return the parsed content as-is
                merged = node.parsed

            # Use the node's title as the key in the dictionary
            result[node.title] = merged
        return result
@@ -0,0 +1,58 @@
1
+ import re
2
+ from .utils import Section, Node
3
+ from .content_parser import parse_content_block
4
+
5
+
6
def split_sections(text: str):
    """
    Cut Markdown text into a flat list of Section objects, one per header line.
    A header is a line that starts with one or more # characters followed by
    a space; a section's content runs from the end of its header line up to
    the next header, or to the end of the text for the last section.
    """
    header_re = re.compile(r'^(?P<header>#+) (?P<title>[^\n]+)', re.MULTILINE)
    headers = list(header_re.finditer(text))

    # Every section stops where the following header starts; the final
    # section stops at the end of the document.
    stops = [following.start() for following in headers[1:]]
    stops.append(len(text))

    return [
        Section(
            title=header.group('title').strip(),
            level=len(header.group('header')),  # number of # characters
            start=header.start(),
            end=stop,
            content=text[header.end():stop].strip(),  # body without the header line
        )
        for header, stop in zip(headers, stops)
    ]
29
+
30
+
31
def build_section_tree(sections):
    """
    Arrange a flat list of Section objects into a tree of Nodes.
    A section becomes a child of the closest preceding section with a
    smaller header level; the list of top-level Nodes is returned.
    """
    root = Node(title='Root', level=0, parsed={}, subsections=[])
    # Chain of currently open ancestors, outermost first
    ancestors = [root]

    for section in sections:
        child = Node(
            title=section.title,
            level=section.level,
            parsed=parse_content_block(section.content),
            subsections=[],
        )

        # Close every open section at the same or a deeper level; what is
        # left on top of the chain is this section's parent.
        while ancestors and ancestors[-1].level >= section.level:
            ancestors.pop()
        ancestors[-1].subsections.append(child)

        # The new node stays open in case later sections nest under it
        ancestors.append(child)

    return root.subsections
@@ -0,0 +1,53 @@
1
+ from typing import Union
2
+ from dataclasses import dataclass
3
+
4
+ from bs4 import BeautifulSoup
5
+ from markdown_it import MarkdownIt
6
+
7
+
8
@dataclass
class Section:
    """
    One header-delimited slice of a Markdown document, before its content is parsed.
    Records the header's title and level, where the slice sits in the source
    text, and the raw text that follows the header.
    """
    title: str    # header text
    level: int    # header depth
    start: int    # offset in the source text where the section begins
    end: int      # offset in the source text where the section ends
    content: str  # raw section text, header excluded
19
+
20
+
21
@dataclass
class Node:
    """
    Represents a node in the hierarchical section tree.
    Holds the section title, header level, parsed content, and nested subsections.
    """
    title: str
    level: int
    # Parsed section content. Numbers are included because convert_value
    # turns numeric text into int/float.
    parsed: Union[dict, list, str, int, float]
    subsections: list  # child Node objects, in document order
31
+
32
+
33
def get_md_soup(text: str) -> BeautifulSoup:
    """
    Render a Markdown text block to HTML and return it parsed as a BeautifulSoup tree.
    """
    renderer = MarkdownIt()
    # Switch on the table and code rules before rendering
    for rule_name in ("table", "code"):
        renderer.enable(rule_name)
    return BeautifulSoup(renderer.render(text), 'html.parser')
42
+
43
+
44
def convert_value(value: str) -> Union[int, float, str]:
    """
    Convert a string to an int or float if possible, or return the string.

    Surrounding whitespace is stripped first. Integer literals are converted
    exactly; other numeric text is parsed as a float and returned as an int
    when it has no fractional part (e.g. "3.0" -> 3, "1e3" -> 1000). Text
    that is not a finite number (including words such as "nan", "inf" and
    "Infinity") is returned as the stripped string. Values without a
    strip() method (already-converted numbers, None, ...) are returned
    unchanged.
    """
    import math

    try:
        value = value.strip()
    except AttributeError:
        # Not a string-like value; nothing to convert
        return value

    # Try int first: going through float would silently corrupt integers
    # that exceed float precision (beyond 2**53).
    try:
        return int(value)
    except ValueError:
        pass

    try:
        num = float(value)
    except ValueError:
        return value

    # float() also accepts "nan", "inf" and "infinity"; in markdown content
    # these are far more likely to be words than numbers, so keep the text.
    if not math.isfinite(num):
        return value
    return int(num) if num.is_integer() else num
@@ -0,0 +1,20 @@
1
+ [tool.poetry]
2
+ name = "markdowndata"
3
+ version = "0.0.1"
4
+ description = "Tool to convert markdown tables into json objects"
5
+ authors = ["Gordon Bean <gbean@cs.byu.edu>", "Robert Greathouse <robbykap@byu.edu>"]
6
+ license = "MIT"
7
+
8
+ [tool.poetry.dependencies]
9
+ python = "^3.10"
10
+ pyyaml = "^6.0.2"
11
+ bs4 = "^0.0.2"
12
+ markdown-it-py = "^3.0.0"
13
+
14
+ [tool.poetry.group.dev.dependencies]
15
+ python-dotenv = "^1.0.1"
16
+ pytest = "^8.2.2"
17
+
18
+ [build-system]
19
+ requires = ["poetry-core>=1.0.0"]
20
+ build-backend = "poetry.core.masonry.api"