markdowndata 0.0.2__tar.gz → 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: markdowndata
3
- Version: 0.0.2
3
+ Version: 0.0.3
4
4
  Summary: Tool to convert markdown tables into json objects
5
5
  License: MIT
6
6
  Author: Gordon Bean
@@ -1,13 +1,13 @@
1
1
  import re
2
2
  import yaml
3
3
 
4
- from .utils import convert_value, get_md_soup
4
+ from .utils import convert_value, get_md_soup, is_single_tag_block, is_structural_line
5
5
 
6
6
 
7
7
  def detect_value_type(text: str) -> str | None:
8
8
  """
9
9
  Detect the type of content block (YAML, table, list, or text).
10
- Returns the content type string or None if no match is found.
10
+ Only classifies as md_table or md_list if the content is *only* that structure.
11
11
  """
12
12
  text = text.strip()
13
13
  if not text:
@@ -17,16 +17,19 @@ def detect_value_type(text: str) -> str | None:
17
17
  if re.search(r'===\s*\n(.*?)\n===', text, re.DOTALL):
18
18
  return 'yaml_dict'
19
19
 
20
- # Convert markdown to HTML and analyze for tables, lists, or text
20
+ # Convert markdown to HTML
21
21
  soup = get_md_soup(text)
22
- if soup.find('table'):
22
+
23
+ # Check for exactly one <table> and no other tags or text
24
+ if soup.find('table') and is_single_tag_block(soup, 'table'):
23
25
  return 'md_table'
24
- elif soup.find('ul'):
26
+
27
+ # Check for exactly one <ul> and no other tags or text
28
+ if soup.find('ul') and is_single_tag_block(soup, 'ul'):
25
29
  return 'md_list'
26
- elif soup.get_text(strip=True):
27
- return 'md_text'
28
30
 
29
- return None
31
+ # Fallback: process everything as Markdown text
32
+ return 'md_text'
30
33
 
31
34
 
32
35
  def yaml_dict_parser(text: str) -> dict:
@@ -77,12 +80,75 @@ def md_list_parser(text: str) -> list:
77
80
 
78
81
  def md_text_parser(text: str) -> str:
79
82
  """
80
- Parse a Markdown text block and return its text content as a string.
81
- Ensures lines flow together as a paragraph, not split across lines.
83
+ Parse Markdown text by:
84
+ - Joining lines separated by a single newline (soft breaks)
85
+ - Preserving formatting (bold, italic, headers, code, code block, etc.)
86
+ - Preserving paragraph breaks (double or more newlines)
87
+ - Preserving fenced code blocks exactly
82
88
  """
83
- soup = get_md_soup(text)
84
- raw_text = soup.get_text(strip=True)
85
- return convert_value(' '.join(raw_text.splitlines()))
89
+ process_text = process_code_blocks(text, process_soft_breaks)
90
+ return convert_value(process_text)
91
+
92
+
93
+ def process_code_blocks(text: str, non_code_callback) -> str:
94
+ """
95
+ Process Markdown text, preserving fenced code blocks and applying a transformation
96
+ only to the non-code sections using `non_code_callback`.
97
+ """
98
+ code_pattern = re.compile(r'```.*?```', re.DOTALL)
99
+ result = []
100
+ last_end = 0
101
+
102
+ for match in code_pattern.finditer(text):
103
+ start, end = match.span()
104
+ non_code_part = text[last_end:start]
105
+ code_block = match.group()
106
+
107
+ if non_code_part.strip():
108
+ result.append(non_code_callback(non_code_part))
109
+ result.append(code_block)
110
+
111
+ last_end = end
112
+
113
+ remaining = text[last_end:]
114
+ if remaining.strip():
115
+ result.append(non_code_callback(remaining))
116
+
117
+ return '\n\n'.join(result)
118
+
119
+
120
+ def process_soft_breaks(text: str) -> str:
121
+ """
122
+ Joins lines separated by a single newline, except for Markdown block elements:
123
+ - Lists
124
+ - Tables
125
+ - Blockquotes
126
+ Preserves paragraph breaks (2+ newlines).
127
+ """
128
+ lines = text.split('\n')
129
+ processed = []
130
+ paragraph_lines = []
131
+
132
+ for line in lines:
133
+ if line.strip() == '':
134
+ if paragraph_lines:
135
+ processed.append(' '.join(paragraph_lines))
136
+ paragraph_lines = []
137
+ processed.append('')
138
+ continue
139
+
140
+ if is_structural_line(line):
141
+ if paragraph_lines:
142
+ processed.append(' '.join(paragraph_lines))
143
+ paragraph_lines = []
144
+ processed.append(line)
145
+ else:
146
+ paragraph_lines.append(line.strip())
147
+
148
+ if paragraph_lines:
149
+ processed.append(' '.join(paragraph_lines))
150
+
151
+ return '\n'.join(processed)
86
152
 
87
153
 
88
154
  def parse_content_block(text: str):
@@ -1,3 +1,4 @@
1
+ import re
1
2
  from typing import Union
2
3
  from dataclasses import dataclass
3
4
 
@@ -5,6 +6,9 @@ from bs4 import BeautifulSoup
5
6
  from markdown_it import MarkdownIt
6
7
 
7
8
 
9
+ STRUCTURAL_LINE_RE = re.compile(r'^(\s*[-*+]\s+|\s*\d+\.\s+|\|.+\||>\s*)')
10
+
11
+
8
12
  @dataclass
9
13
  class Section:
10
14
  """
@@ -41,13 +45,33 @@ def get_md_soup(text: str) -> BeautifulSoup:
41
45
  return BeautifulSoup(html, 'html.parser')
42
46
 
43
47
 
48
+ def is_single_tag_block(soup, tag_name: str) -> bool:
49
+ """
50
+ Check if the block consists of a single top-level tag (e.g. <table>, <ul>)
51
+ with no other sibling tags.
52
+ """
53
+ tags = soup.find_all(recursive=False)
54
+ return len(tags) == 1 and tags[0].name == tag_name
55
+
56
+
57
+ def is_structural_line(line: str) -> bool:
58
+ """
59
+ Returns True if the line is a Markdown block element (list, table row, blockquote).
60
+ """
61
+ return bool(STRUCTURAL_LINE_RE.match(line))
62
+
63
+
44
64
  def convert_value(value: str) -> Union[int, float, str]:
45
65
  """
46
66
  Convert a string to an int, float, or datetime object if possible, or return the original string.
47
67
  """
48
68
  try:
49
69
  value = value.strip()
50
- num = float(value)
51
- return int(num) if num.is_integer() else num
70
+ if value.isdigit():
71
+ return int(value)
72
+ elif '.' in value:
73
+ return float(value)
74
+ else:
75
+ return value
52
76
  except (ValueError, AttributeError):
53
77
  return value
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "markdowndata"
3
- version = "0.0.2"
3
+ version = "0.0.3"
4
4
  description = "Tool to convert markdown tables into json objects"
5
5
  authors = ["Gordon Bean <gbean@cs.byu.edu>", "Robert Greathouse <robbykap@byu.edu>"]
6
6
  license = "MIT"
File without changes