toolslm 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
toolslm/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.3.1"
1
+ __version__ = "0.3.2"
toolslm/md_hier.py CHANGED
@@ -1,75 +1,63 @@
1
1
  import re
2
2
  from fastcore.utils import *
3
- __all__ = ['markdown_to_dict', 'create_heading_dict']
3
+ __all__ = ['create_heading_dict', 'HeadingDict']
4
4
 
5
- def markdown_to_dict(
6
- markdown_content:str # Markdown text including headings
7
- )->AttrDict: # Dictionary with dot-separated hierarchical keys and content values
8
- "Parse markdown content into a hierarchical dictionary with dot-separated keys."
9
- def clean_heading(text): return re.sub(r'[.]+', '', text).strip() # Only remove dots (key separator)
5
+ class HeadingDict(dict):
6
+ """A dictionary-like object that also stores the markdown text content."""
7
+ def __init__(self, text="", *args, **kwargs):
8
+ super().__init__(*args, **kwargs)
9
+ self.text = text
10
10
 
11
- lines = markdown_content.splitlines()
11
+
12
+ def create_heading_dict(text, rm_fenced=True):
13
+ "Create a nested dictionary structure from markdown headings."
14
+ original_text = text
15
+ original_lines = text.splitlines()
16
+
17
+ # Use fenced-removed text only for finding headings
18
+ text_for_headings = text
19
+ if rm_fenced: text_for_headings = re.sub(r'```[\s\S]*?```', '', text)
20
+
21
+ lines_for_headings = text_for_headings.splitlines()
12
22
  headings = []
13
- in_code_block = False
14
23
 
15
24
  # Parse headings with their levels and line numbers
16
- for idx, line in enumerate(lines):
17
- # Toggle code block state when encountering fence
18
- if line.strip().startswith('```'): in_code_block = not in_code_block
19
- # Only detect headings when not in a code block
20
- if in_code_block: continue
21
- match = re.match(r'^(#{1,6})\s*(.*)', line)
25
+ for idx, line in enumerate(lines_for_headings):
26
+ match = re.match(r'^(#{1,6})\s+\S.*', line)
22
27
  if match:
23
28
  level = len(match.group(1))
24
- text = match.group(2).strip()
25
- headings.append({'level': level, 'text': text, 'line': idx})
29
+ title = line.strip('#').strip()
30
+ headings.append({'level': level, 'title': title, 'line': idx})
26
31
 
27
- # Assign content to each heading, including subheadings
32
+ # Assign text content to each heading using original lines
28
33
  for i, h in enumerate(headings):
29
- start = h['line'] # Include the heading line itself
34
+ start = h['line']
30
35
  # Find the end index: next heading of same or higher level
31
36
  for j in range(i + 1, len(headings)):
32
37
  if headings[j]['level'] <= h['level']:
33
38
  end = headings[j]['line']
34
39
  break
35
- else: end = len(lines)
36
- h['content'] = '\n'.join(lines[start:end]).strip()
37
-
38
- # Build the dictionary with hierarchical keys
39
- result,stack = {},[]
40
- if not headings:
41
- return dict2obj(result)
40
+ else:
41
+ end = len(original_lines)
42
+ h['content'] = '\n'.join(original_lines[start:end]).strip()
42
43
 
43
- first_level = headings[0]['level']
44
- for h in headings:
45
- stack = stack[:h['level'] - first_level] + [clean_heading(h['text'])]
46
- key = '.'.join(stack)
47
- result[key] = h['content']
48
- return dict2obj(result)
49
-
50
- def create_heading_dict(text, rm_fenced=True):
51
- "Create a nested dictionary structure from markdown headings."
52
- if rm_fenced: text = re.sub(r'```[\s\S]*?```', '', text)
53
- headings = re.findall(r'^#+.*', text, flags=re.MULTILINE)
54
- result = {}
44
+ # Build the nested structure
45
+ result = HeadingDict(original_text)
55
46
  stack = [result]
56
- stack_levels = [0] # Track the level at each stack position
57
-
58
- for heading in headings:
59
- level = heading.count('#')
60
- title = heading.strip('#').strip()
47
+ stack_levels = [0]
61
48
 
49
+ for h in headings:
62
50
  # Pop stack until we find the right parent level
63
- while len(stack) > 1 and stack_levels[-1] >= level:
51
+ while len(stack) > 1 and stack_levels[-1] >= h['level']:
64
52
  stack.pop()
65
53
  stack_levels.pop()
66
54
 
67
- new_dict = {}
68
- stack[-1][title] = new_dict
55
+ new_dict = HeadingDict(h['content'])
56
+ stack[-1][h['title']] = new_dict
69
57
  stack.append(new_dict)
70
- stack_levels.append(level)
58
+ stack_levels.append(h['level'])
71
59
 
72
- return dict2obj(result)
60
+ return result
73
61
 
74
62
 
75
63
  if __name__=='__main__':
@@ -101,63 +89,71 @@ Admin section.
101
89
  Admin users management.
102
90
  """
103
91
 
104
- result = markdown_to_dict(md_content)
92
+ result = create_heading_dict(md_content)
105
93
  #for key, value in result.items(): print(f'Key: {key}\nValue:\n{value}\n{"-"*40}')
106
94
 
107
95
  def test_empty_content():
108
96
  md_content = "# Empty Heading"
109
- result = markdown_to_dict(md_content)
110
- assert result['Empty Heading'] == '# Empty Heading'
97
+ result = create_heading_dict(md_content)
98
+ assert 'Empty Heading' in result
99
+ assert result['Empty Heading'].text == '# Empty Heading'
100
+ assert result.text == md_content
111
101
 
112
102
  def test_special_characters():
113
103
  md_content = "# Heading *With* Special _Characters_!\nContent under heading."
114
- result = markdown_to_dict(md_content)
104
+ result = create_heading_dict(md_content)
115
105
  assert 'Heading *With* Special _Characters_!' in result
116
- assert result['Heading *With* Special _Characters_!'] == '# Heading *With* Special _Characters_!\nContent under heading.'
106
+ assert result['Heading *With* Special _Characters_!'].text == '# Heading *With* Special _Characters_!\nContent under heading.'
107
+ assert result.text == md_content
117
108
 
118
109
  def test_duplicate_headings():
119
110
  md_content = "# Duplicate\n## Duplicate\n### Duplicate\nContent under duplicate headings."
120
- result = markdown_to_dict(md_content)
111
+ result = create_heading_dict(md_content)
121
112
  assert 'Duplicate' in result
122
- assert 'Duplicate.Duplicate' in result
123
- assert 'Duplicate.Duplicate.Duplicate' in result
124
- assert result['Duplicate.Duplicate.Duplicate'] == '### Duplicate\nContent under duplicate headings.'
113
+ assert 'Duplicate' in result['Duplicate']
114
+ assert 'Duplicate' in result['Duplicate']['Duplicate']
115
+ assert result['Duplicate']['Duplicate']['Duplicate'].text == '### Duplicate\nContent under duplicate headings.'
116
+ assert result.text == md_content
125
117
 
126
118
  def test_no_content():
127
119
  md_content = "# No Content Heading\n## Subheading"
128
- result = markdown_to_dict(md_content)
129
- assert result['No Content Heading'] == '# No Content Heading\n## Subheading'
130
- assert result['No Content Heading.Subheading'] == '## Subheading'
120
+ result = create_heading_dict(md_content)
121
+ assert result['No Content Heading'].text == '# No Content Heading\n## Subheading'
122
+ assert result['No Content Heading']['Subheading'].text == '## Subheading'
123
+ assert result.text == md_content
131
124
 
132
125
  def test_different_levels():
133
126
  md_content = "### Level 3 Heading\nContent at level 3.\n# Level 1 Heading\nContent at level 1."
134
- result = markdown_to_dict(md_content)
127
+ result = create_heading_dict(md_content)
135
128
  assert 'Level 3 Heading' in result
136
129
  assert 'Level 1 Heading' in result
137
- assert result['Level 3 Heading'] == '### Level 3 Heading\nContent at level 3.'
138
- assert result['Level 1 Heading'] == '# Level 1 Heading\nContent at level 1.'
130
+ assert result['Level 3 Heading'].text == '### Level 3 Heading\nContent at level 3.'
131
+ assert result['Level 1 Heading'].text == '# Level 1 Heading\nContent at level 1.'
132
+ assert result.text == md_content
139
133
 
140
134
  def test_parent_includes_subheadings():
141
135
  md_content = "# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content."
142
- result = markdown_to_dict(md_content)
143
- assert result['Parent'] == '# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content.'
144
- assert result['Parent.Child'] == '## Child\nChild content.\n### Grandchild\nGrandchild content.'
145
- assert result['Parent.Child.Grandchild'] == '### Grandchild\nGrandchild content.'
136
+ result = create_heading_dict(md_content)
137
+ assert result['Parent'].text == '# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content.'
138
+ assert result['Parent']['Child'].text == '## Child\nChild content.\n### Grandchild\nGrandchild content.'
139
+ assert result['Parent']['Child']['Grandchild'].text == '### Grandchild\nGrandchild content.'
140
+ assert result.text == md_content
146
141
 
147
142
  def test_multiple_level2_siblings():
148
- md_content = "##Sib 1\n##Sib 2\n##Sib 3\n##Sib 4\n##Sib 5'"
149
- result = markdown_to_dict(md_content)
143
+ md_content = "## Sib 1\n## Sib 2\n## Sib 3\n## Sib 4\n## Sib 5'"
144
+ result = create_heading_dict(md_content)
150
145
  assert 'Sib 1' in result
151
146
  assert 'Sib 2' in result
152
147
  assert 'Sib 3' in result
153
148
  assert 'Sib 4' in result
154
- assert "Sib 5'" in result # Note the apostrophe is preserved
149
+ assert "Sib 5'" in result
150
+ assert result.text == md_content
155
151
 
156
152
  def test_code_chunks_escaped():
157
153
  md_content = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```"
158
- result = markdown_to_dict(md_content)
159
- assert 'Code comment' not in result
160
- assert "# Code comment" in result['Parent.Child']
154
+ result = create_heading_dict(md_content)
155
+ assert 'Code comment' not in str(result)
156
+ assert result.text == md_content
161
157
 
162
158
  test_empty_content()
163
159
  test_special_characters()
@@ -180,8 +176,28 @@ Admin users management.
180
176
  result = create_heading_dict(md_content)
181
177
  assert 'Code comment' not in result
182
178
 
179
+ def test_fenced_blocks_preserved_in_text():
180
+ md_content = """# Section
181
+ Content before code.
182
+
183
+ ```python
184
+ # This heading should be ignored for structure
185
+ def hello():
186
+ print("Hello, world!")
187
+ ```
188
+
189
+ More content after code."""
190
+ result = create_heading_dict(md_content)
191
+ # Fenced code should be preserved in text content
192
+ assert '```python' in result['Section'].text
193
+ assert 'def hello():' in result['Section'].text
194
+ assert '```' in result['Section'].text
195
+ # But headings inside fenced blocks should not create structure
196
+ assert 'This heading should be ignored for structure' not in result['Section']
197
+
183
198
  test_nested_headings()
184
199
  test_code_chunks_escaped()
200
+ test_fenced_blocks_preserved_in_text()
185
201
 
186
202
  def test_multiple_h1s():
187
203
  md_content = "# First H1\n# Second H1\n# Third H1"
@@ -221,48 +237,38 @@ Admin users management.
221
237
  test_skip_levels_up()
222
238
  test_non_h1_start()
223
239
 
224
- # Critical edge case tests
240
+ # Edge case tests
225
241
  def test_empty_input():
226
- result = markdown_to_dict("")
227
- assert result == {}
228
242
  result = create_heading_dict("")
229
243
  assert result == {}
244
+ assert result.text == ""
230
245
 
231
246
  def test_whitespace_only():
232
- result = markdown_to_dict(" \n\t \n ")
233
- assert result == {}
234
247
  result = create_heading_dict(" \n\t \n ")
235
248
  assert result == {}
249
+ assert result.text == " \n\t \n "
236
250
 
237
251
  def test_malformed_headings():
238
- # No space after # (actually works - regex allows it)
239
- md_content = "#NoSpace\n###AlsoNoSpace\nContent"
240
- result = markdown_to_dict(md_content)
241
- assert 'NoSpace' in result
242
- assert 'NoSpace.AlsoNoSpace' in result
243
-
244
- # Too many #s (matches max 6, extra # preserved in text)
252
+ # Too many #s (matches max 6)
245
253
  md_content = "####### Too Many\nContent"
246
- result = markdown_to_dict(md_content)
247
- assert '# Too Many' in result # Extra # now preserved in heading text
248
-
249
- # Empty heading (actually creates empty key)
250
- md_content = "## \nContent after empty heading"
251
- result = markdown_to_dict(md_content)
252
- assert '' in result # Empty heading creates empty key
254
+ result = create_heading_dict(md_content)
255
+ assert 'Too Many' not in result
256
+ assert result.text == md_content
253
257
 
254
258
  def test_unicode_and_emojis():
255
259
  # Unicode characters
256
260
  md_content = "# Café & Naïve\nContent with unicode\n## 中文标题\nChinese content"
257
- result = markdown_to_dict(md_content)
261
+ result = create_heading_dict(md_content)
258
262
  assert 'Café & Naïve' in result
259
- assert 'Café & Naïve.中文标题' in result
263
+ assert '中文标题' in result['Café & Naïve']
264
+ assert result.text == md_content
260
265
 
261
266
  # Emojis
262
267
  md_content = "# 🚀 Rocket Heading\nRocket content\n## 💻 Computer\nComputer content"
263
- result = markdown_to_dict(md_content)
268
+ result = create_heading_dict(md_content)
264
269
  assert '🚀 Rocket Heading' in result
265
- assert '🚀 Rocket Heading.💻 Computer' in result
270
+ assert '💻 Computer' in result['🚀 Rocket Heading']
271
+ assert result.text == md_content
266
272
 
267
273
  test_empty_input()
268
274
  test_whitespace_only()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: toolslm
3
- Version: 0.3.1
3
+ Version: 0.3.2
4
4
  Summary: Tools to make language models a bit easier to use
5
5
  Home-page: https://github.com/AnswerDotAI/toolslm
6
6
  Author: Jeremy Howard
@@ -0,0 +1,13 @@
1
+ toolslm/__init__.py,sha256=vNiWJ14r_cw5t_7UDqDQIVZvladKFGyHH2avsLpN7Vg,22
2
+ toolslm/_modidx.py,sha256=-D-B5o30VGs11gBKf96lpADVXnZhdiVEshJpLzmUnDs,4378
3
+ toolslm/download.py,sha256=g3BxUSxylC_575M7RFSJ1GI3Co3EwPDdEeWzxaf2Czk,4451
4
+ toolslm/funccall.py,sha256=7nPfbcvDRMWiVKBKMLlCOMInoUJgDs5e38ef2T7QBHY,8485
5
+ toolslm/md_hier.py,sha256=Havk9Hf0t2Xt67n_r7ZxCsS0pciR85iLcE5quShvkTg,10032
6
+ toolslm/shell.py,sha256=dGInuRKvexu21VmtZkw_0S3BGiTsbAongUG-yG4YHpc,1566
7
+ toolslm/xml.py,sha256=D665Nk7NzyZlXyXrpnIRqfK2xQ-6Gf0bCSgocjF7zik,4061
8
+ toolslm-0.3.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
9
+ toolslm-0.3.2.dist-info/METADATA,sha256=5lWEv7BWTwdd5cvXgGsQXqr0j6tk8UIcGpRTlcjV3V4,2404
10
+ toolslm-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
11
+ toolslm-0.3.2.dist-info/entry_points.txt,sha256=xFz0Eymlo5X7BGpaO6DI9gMxvN5A7faebzrlr8ctp5I,95
12
+ toolslm-0.3.2.dist-info/top_level.txt,sha256=4hRTrFWayz_Kz5221XjvlpCwVFrW3WPi1P0fllkTq9s,8
13
+ toolslm-0.3.2.dist-info/RECORD,,
@@ -1,13 +0,0 @@
1
- toolslm/__init__.py,sha256=r4xAFihOf72W9TD-lpMi6ntWSTKTP2SlzKP1ytkjRbI,22
2
- toolslm/_modidx.py,sha256=-D-B5o30VGs11gBKf96lpADVXnZhdiVEshJpLzmUnDs,4378
3
- toolslm/download.py,sha256=g3BxUSxylC_575M7RFSJ1GI3Co3EwPDdEeWzxaf2Czk,4451
4
- toolslm/funccall.py,sha256=7nPfbcvDRMWiVKBKMLlCOMInoUJgDs5e38ef2T7QBHY,8485
5
- toolslm/md_hier.py,sha256=qvPjS3eRGcf4COnrGhdzqRF5_LGUqnu7LWixOy_280E,10076
6
- toolslm/shell.py,sha256=dGInuRKvexu21VmtZkw_0S3BGiTsbAongUG-yG4YHpc,1566
7
- toolslm/xml.py,sha256=D665Nk7NzyZlXyXrpnIRqfK2xQ-6Gf0bCSgocjF7zik,4061
8
- toolslm-0.3.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
9
- toolslm-0.3.1.dist-info/METADATA,sha256=2-AJ1GSzVATnoJ6XHCAMp85oidTS-zbxS0vH6jJfIRE,2404
10
- toolslm-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
11
- toolslm-0.3.1.dist-info/entry_points.txt,sha256=xFz0Eymlo5X7BGpaO6DI9gMxvN5A7faebzrlr8ctp5I,95
12
- toolslm-0.3.1.dist-info/top_level.txt,sha256=4hRTrFWayz_Kz5221XjvlpCwVFrW3WPi1P0fllkTq9s,8
13
- toolslm-0.3.1.dist-info/RECORD,,