markdowndata 0.0.1__tar.gz → 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.3
2
2
  Name: markdowndata
3
- Version: 0.0.1
3
+ Version: 0.0.3
4
4
  Summary: Tool to convert markdown tables into json objects
5
5
  License: MIT
6
6
  Author: Gordon Bean
@@ -0,0 +1,180 @@
1
+ import re
2
+ import yaml
3
+
4
+ from .utils import convert_value, get_md_soup, is_single_tag_block, is_structural_line
5
+
6
+
7
+ def detect_value_type(text: str) -> str | None:
8
+ """
9
+ Detect the type of content block (YAML, table, list, or text).
10
+ Only classifies as md_table or md_list if the content is *only* that structure.
11
+ """
12
+ text = text.strip()
13
+ if not text:
14
+ return None
15
+
16
+ # YAML detection (delimited by ===)
17
+ if re.search(r'===\s*\n(.*?)\n===', text, re.DOTALL):
18
+ return 'yaml_dict'
19
+
20
+ # Convert markdown to HTML
21
+ soup = get_md_soup(text)
22
+
23
+ # Check for exactly one <table> and no other tags or text
24
+ if soup.find('table') and is_single_tag_block(soup, 'table'):
25
+ return 'md_table'
26
+
27
+ # Check for exactly one <ul> and no other tags or text
28
+ if soup.find('ul') and is_single_tag_block(soup, 'ul'):
29
+ return 'md_list'
30
+
31
+ # Fallback: process everything as Markdown text
32
+ return 'md_text'
33
+
34
+
35
+ def yaml_dict_parser(text: str) -> dict:
36
+ """
37
+ Parse YAML from a string (surrounded by ===) and return it as a dictionary.
38
+ Assumes YAML is a block at the beginning of the text.
39
+ """
40
+ match = re.search(r'===\s*\n(.*?)\n===', text, re.DOTALL)
41
+ if match:
42
+ yaml_data = yaml.safe_load(match.group(1))
43
+ if yaml_data:
44
+ return {k: convert_value(v) for k, v in yaml_data.items()}
45
+ return {}
46
+
47
+
48
+ def md_table_parser(text: str) -> list[dict]:
49
+ """
50
+ Parse a Markdown table and return it as a list of dictionaries.
51
+ Assumes the markdown is converted to HTML with <table> elements.
52
+ """
53
+ soup = get_md_soup(text)
54
+ table = soup.find('table')
55
+ if not table:
56
+ return []
57
+
58
+ # Extract headers and row data
59
+ headers = [th.get_text(strip=True) for th in table.find_all('th')]
60
+ rows = []
61
+ for tr in table.find_all('tr')[1:]: # Skip header row
62
+ cells = [convert_value(td.get_text(strip=True)) for td in tr.find_all(['td', 'th'])]
63
+ if len(cells) == len(headers):
64
+ rows.append(dict(zip(headers, cells)))
65
+
66
+ return rows
67
+
68
+
69
+ def md_list_parser(text: str) -> list:
70
+ """
71
+ Parse a Markdown list and return it as a list of values.
72
+ Assumes the markdown is converted to HTML with <ul> elements.
73
+ """
74
+ soup = get_md_soup(text)
75
+ ul = soup.find('ul')
76
+ if not ul:
77
+ return []
78
+ return [convert_value(li.get_text(strip=True)) for li in ul.find_all('li')]
79
+
80
+
81
+ def md_text_parser(text: str) -> str:
82
+ """
83
+ Parse Markdown text by:
84
+ - Joining lines separated by a single newline (soft breaks)
85
+ - Preserving formatting (bold, italic, headers, code, code block, etc.)
86
+ - Preserving paragraph breaks (double or more newlines)
87
+ - Preserving fenced code blocks exactly
88
+ """
89
+ process_text = process_code_blocks(text, process_soft_breaks)
90
+ return convert_value(process_text)
91
+
92
+
93
+ def process_code_blocks(text: str, non_code_callback) -> str:
94
+ """
95
+ Process Markdown text, preserving fenced code blocks and applying a transformation
96
+ only to the non-code sections using `non_code_callback`.
97
+ """
98
+ code_pattern = re.compile(r'```.*?```', re.DOTALL)
99
+ result = []
100
+ last_end = 0
101
+
102
+ for match in code_pattern.finditer(text):
103
+ start, end = match.span()
104
+ non_code_part = text[last_end:start]
105
+ code_block = match.group()
106
+
107
+ if non_code_part.strip():
108
+ result.append(non_code_callback(non_code_part))
109
+ result.append(code_block)
110
+
111
+ last_end = end
112
+
113
+ remaining = text[last_end:]
114
+ if remaining.strip():
115
+ result.append(non_code_callback(remaining))
116
+
117
+ return '\n\n'.join(result)
118
+
119
+
120
+ def process_soft_breaks(text: str) -> str:
121
+ """
122
+ Joins lines separated by a single newline, except for Markdown block elements:
123
+ - Lists
124
+ - Tables
125
+ - Blockquotes
126
+ Preserves paragraph breaks (2+ newlines).
127
+ """
128
+ lines = text.split('\n')
129
+ processed = []
130
+ paragraph_lines = []
131
+
132
+ for line in lines:
133
+ if line.strip() == '':
134
+ if paragraph_lines:
135
+ processed.append(' '.join(paragraph_lines))
136
+ paragraph_lines = []
137
+ processed.append('')
138
+ continue
139
+
140
+ if is_structural_line(line):
141
+ if paragraph_lines:
142
+ processed.append(' '.join(paragraph_lines))
143
+ paragraph_lines = []
144
+ processed.append(line)
145
+ else:
146
+ paragraph_lines.append(line.strip())
147
+
148
+ if paragraph_lines:
149
+ processed.append(' '.join(paragraph_lines))
150
+
151
+ return '\n'.join(processed)
152
+
153
+
154
+ def parse_content_block(text: str):
155
+ """
156
+ Parse a given block of Markdown text into structured data.
157
+ Automatically detects the content type (YAML, table, list, text) and
158
+ dispatches to the appropriate parser. Raises an error if the content
159
+ cannot be parsed or returns an empty result.
160
+ """
161
+ text = text.strip()
162
+ if not text:
163
+ return {}
164
+
165
+ v_type = detect_value_type(text)
166
+ if not v_type:
167
+ raise ValueError(f'No parser found for content: {text}')
168
+
169
+ parser_functions = {
170
+ 'yaml_dict': yaml_dict_parser,
171
+ 'md_table': md_table_parser,
172
+ 'md_list': md_list_parser,
173
+ 'md_text': md_text_parser
174
+ }
175
+
176
+ parser = parser_functions[v_type]
177
+ value = parser(text)
178
+ if not value:
179
+ raise ValueError(f'Parser for {v_type} returned empty value for: {text}')
180
+ return value
@@ -1,3 +1,4 @@
1
+ import re
1
2
  from typing import Union
2
3
  from dataclasses import dataclass
3
4
 
@@ -5,6 +6,9 @@ from bs4 import BeautifulSoup
5
6
  from markdown_it import MarkdownIt
6
7
 
7
8
 
9
+ STRUCTURAL_LINE_RE = re.compile(r'^(\s*[-*+]\s+|\s*\d+\.\s+|\|.+\||>\s*)')
10
+
11
+
8
12
  @dataclass
9
13
  class Section:
10
14
  """
@@ -41,13 +45,33 @@ def get_md_soup(text: str) -> BeautifulSoup:
41
45
  return BeautifulSoup(html, 'html.parser')
42
46
 
43
47
 
48
+ def is_single_tag_block(soup, tag_name: str) -> bool:
49
+ """
50
+ Check if the block consists of a single top-level tag (e.g. <table>, <ul>)
51
+ with no other sibling tags.
52
+ """
53
+ tags = soup.find_all(recursive=False)
54
+ return len(tags) == 1 and tags[0].name == tag_name
55
+
56
+
57
+ def is_structural_line(line: str) -> bool:
58
+ """
59
+ Returns True if the line is a Markdown block element (list, table row, blockquote).
60
+ """
61
+ return bool(STRUCTURAL_LINE_RE.match(line))
62
+
63
+
44
64
  def convert_value(value: str) -> Union[int, float, str]:
45
65
  """
46
66
  Convert a string to an int, float, or datetime object if possible, or return the original string.
47
67
  """
48
68
  try:
49
69
  value = value.strip()
50
- num = float(value)
51
- return int(num) if num.is_integer() else num
70
+ if value.isdigit():
71
+ return int(value)
72
+ elif '.' in value:
73
+ return float(value)
74
+ else:
75
+ return value
52
76
  except (ValueError, AttributeError):
53
77
  return value
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "markdowndata"
3
- version = "0.0.1"
3
+ version = "0.0.3"
4
4
  description = "Tool to convert markdown tables into json objects"
5
5
  authors = ["Gordon Bean <gbean@cs.byu.edu>", "Robert Greathouse <robbykap@byu.edu>"]
6
6
  license = "MIT"
@@ -1,114 +0,0 @@
1
- import re
2
- import yaml
3
-
4
- from .utils import convert_value, get_md_soup
5
-
6
-
7
- def detect_value_type(text: str) -> str | None:
8
- """
9
- Detect the type of content block (YAML, table, list, or text).
10
- Returns the content type string or None if no match is found.
11
- """
12
- text = text.strip()
13
- if not text:
14
- return None
15
-
16
- # YAML detection (delimited by ---)
17
- if re.search(r'---\s*\n(.*?)\n---', text, re.DOTALL):
18
- return 'yaml_dict'
19
-
20
- # Convert markdown to HTML and analyze for tables, lists, or text
21
- soup = get_md_soup(text)
22
- if soup.find('table'):
23
- return 'md_table'
24
- elif soup.find('ul'):
25
- return 'md_list'
26
- elif soup.get_text(strip=True):
27
- return 'md_text'
28
-
29
- return None
30
-
31
-
32
- def yaml_dict_parser(text: str) -> dict:
33
- """
34
- Parse YAML from a string (surrounded by ---) and returns it as a dictionary.
35
- Assumes YAML is a block at the beginning of the text.
36
- """
37
- match = re.search(r'---\s*\n(.*?)\n---', text, re.DOTALL)
38
- if match:
39
- yaml_data = yaml.safe_load(match.group(1))
40
- if yaml_data:
41
- return {k: convert_value(v) for k, v in yaml_data.items()}
42
- return {}
43
-
44
-
45
- def md_table_parser(text: str) -> list[dict]:
46
- """
47
- Parse a Markdown table and returns it as a list of dictionaries.
48
- Assumes the markdown is converted to HTML with <table> elements.
49
- """
50
- soup = get_md_soup(text)
51
- table = soup.find('table')
52
- if not table:
53
- return []
54
-
55
- # Extract headers and row data
56
- headers = [th.get_text(strip=True) for th in table.find_all('th')]
57
- rows = []
58
- for tr in table.find_all('tr')[1:]: # Skip header row
59
- cells = [convert_value(td.get_text(strip=True)) for td in tr.find_all(['td', 'th'])]
60
- if len(cells) == len(headers):
61
- rows.append(dict(zip(headers, cells)))
62
-
63
- return rows
64
-
65
-
66
- def md_list_parser(text: str) -> list:
67
- """
68
- Parse a Markdown list and returns it as a list of values.
69
- Assumes the markdown is converted to HTML with <ul> elements.
70
- """
71
- soup = get_md_soup(text)
72
- ul = soup.find('ul')
73
- if not ul:
74
- return []
75
- return [convert_value(li.get_text(strip=True)) for li in ul.find_all('li')]
76
-
77
-
78
- def md_text_parser(text: str) -> str:
79
- """
80
- Parse a Markdown text block and return its text content as a string.
81
- Ensures lines flow together as a paragraph, not split across lines.
82
- """
83
- soup = get_md_soup(text)
84
- raw_text = soup.get_text(strip=True)
85
- return convert_value(' '.join(raw_text.splitlines()))
86
-
87
-
88
- def parse_content_block(text: str):
89
- """
90
- Parse a given block of Markdown text into structured data.
91
- Automatically detects the content type (YAML, table, list, text) and
92
- dispatches to the appropriate parser. Raises an error if the content
93
- cannot be parsed or returns an empty result.
94
- """
95
- text = text.strip()
96
- if not text:
97
- return {}
98
-
99
- v_type = detect_value_type(text)
100
- if not v_type:
101
- raise ValueError(f'No parser found for content: {text}')
102
-
103
- parser_functions = {
104
- 'yaml_dict': yaml_dict_parser,
105
- 'md_table': md_table_parser,
106
- 'md_list': md_list_parser,
107
- 'md_text': md_text_parser
108
- }
109
-
110
- parser = parser_functions[v_type]
111
- value = parser(text)
112
- if not value:
113
- raise ValueError(f'Parser for {v_type} returned empty value for: {text}')
114
- return value
File without changes