toolslm 0.3.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: toolslm
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: Tools to make language models a bit easier to use
5
5
  Home-page: https://github.com/AnswerDotAI/toolslm
6
6
  Author: Jeremy Howard
@@ -1,7 +1,7 @@
1
1
  [DEFAULT]
2
2
  repo = toolslm
3
3
  lib_name = toolslm
4
- version = 0.3.0
4
+ version = 0.3.1
5
5
  min_python = 3.9
6
6
  license = apache2
7
7
  black_formatting = False
@@ -0,0 +1 @@
1
+ __version__ = "0.3.1"
@@ -2,8 +2,11 @@ import re
2
2
  from fastcore.utils import *
3
3
  __all__ = ['markdown_to_dict', 'create_heading_dict']
4
4
 
5
- def markdown_to_dict(markdown_content):
6
- def clean_heading(text): return re.sub(r'[^A-Za-z0-9 ]+', '', text).strip()
5
+ def markdown_to_dict(
6
+ markdown_content:str # Markdown text including headings
7
+ )->AttrDict: # Dictionary with dot-separated hierarchical keys and content values
8
+ "Parse markdown content into a hierarchical dictionary with dot-separated keys."
9
+ def clean_heading(text): return re.sub(r'[.]+', '', text).strip() # Only remove dots (key separator)
7
10
 
8
11
  lines = markdown_content.splitlines()
9
12
  headings = []
@@ -13,7 +16,6 @@ def markdown_to_dict(markdown_content):
13
16
  for idx, line in enumerate(lines):
14
17
  # Toggle code block state when encountering fence
15
18
  if line.strip().startswith('```'): in_code_block = not in_code_block
16
-
17
19
  # Only detect headings when not in a code block
18
20
  if in_code_block: continue
19
21
  match = re.match(r'^(#{1,6})\s*(.*)', line)
@@ -35,6 +37,9 @@ def markdown_to_dict(markdown_content):
35
37
 
36
38
  # Build the dictionary with hierarchical keys
37
39
  result,stack = {},[]
40
+ if not headings:
41
+ return dict2obj(result)
42
+
38
43
  first_level = headings[0]['level']
39
44
  for h in headings:
40
45
  stack = stack[:h['level'] - first_level] + [clean_heading(h['text'])]
@@ -42,23 +47,28 @@ def markdown_to_dict(markdown_content):
42
47
  result[key] = h['content']
43
48
  return dict2obj(result)
44
49
 
45
- def create_heading_dict(text):
46
- text = re.sub(r'```[\s\S]*?```', '', text)
50
+ def create_heading_dict(text, rm_fenced=True):
51
+ "Create a nested dictionary structure from markdown headings."
52
+ if rm_fenced: text = re.sub(r'```[\s\S]*?```', '', text)
47
53
  headings = re.findall(r'^#+.*', text, flags=re.MULTILINE)
48
54
  result = {}
49
55
  stack = [result]
50
- prev_level = 0
56
+ stack_levels = [0] # Track the level at each stack position
51
57
 
52
58
  for heading in headings:
53
59
  level = heading.count('#')
54
60
  title = heading.strip('#').strip()
55
- while level <= prev_level:
61
+
62
+ # Pop stack until we find the right parent level
63
+ while len(stack) > 1 and stack_levels[-1] >= level:
56
64
  stack.pop()
57
- prev_level -= 1
65
+ stack_levels.pop()
66
+
58
67
  new_dict = {}
59
68
  stack[-1][title] = new_dict
60
69
  stack.append(new_dict)
61
- prev_level = level
70
+ stack_levels.append(level)
71
+
62
72
  return dict2obj(result)
63
73
 
64
74
 
@@ -102,8 +112,8 @@ Admin users management.
102
112
  def test_special_characters():
103
113
  md_content = "# Heading *With* Special _Characters_!\nContent under heading."
104
114
  result = markdown_to_dict(md_content)
105
- assert 'Heading With Special Characters' in result
106
- assert result['Heading With Special Characters'] == '# Heading *With* Special _Characters_!\nContent under heading.'
115
+ assert 'Heading *With* Special _Characters_!' in result
116
+ assert result['Heading *With* Special _Characters_!'] == '# Heading *With* Special _Characters_!\nContent under heading.'
107
117
 
108
118
  def test_duplicate_headings():
109
119
  md_content = "# Duplicate\n## Duplicate\n### Duplicate\nContent under duplicate headings."
@@ -141,8 +151,8 @@ Admin users management.
141
151
  assert 'Sib 2' in result
142
152
  assert 'Sib 3' in result
143
153
  assert 'Sib 4' in result
144
- assert 'Sib 5' in result
145
-
154
+ assert "Sib 5'" in result # Note the apostrophe is preserved
155
+
146
156
  def test_code_chunks_escaped():
147
157
  md_content = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```"
148
158
  result = markdown_to_dict(md_content)
@@ -159,7 +169,7 @@ Admin users management.
159
169
  test_code_chunks_escaped()
160
170
  print('tests passed')
161
171
 
162
- def test_nested_headings():
172
+ def test_nested_headings():
163
173
  md_content = "# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content."
164
174
  result = create_heading_dict(md_content)
165
175
  assert 'Child' in result['Parent']
@@ -169,7 +179,94 @@ Admin users management.
169
179
  md_content = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```"
170
180
  result = create_heading_dict(md_content)
171
181
  assert 'Code comment' not in result
172
-
182
+
173
183
  test_nested_headings()
174
184
  test_code_chunks_escaped()
175
- print('tests passed')
185
+
186
+ def test_multiple_h1s():
187
+ md_content = "# First H1\n# Second H1\n# Third H1"
188
+ result = create_heading_dict(md_content)
189
+ assert 'First H1' in result
190
+ assert 'Second H1' in result
191
+ assert 'Third H1' in result
192
+ assert result['First H1'] == {}
193
+ assert result['Second H1'] == {}
194
+ assert result['Third H1'] == {}
195
+
196
+ def test_skip_levels_down():
197
+ md_content = "# Root\n## Level2\n#### Level4"
198
+ result = create_heading_dict(md_content)
199
+ assert 'Root' in result
200
+ assert 'Level2' in result['Root']
201
+ assert 'Level4' in result['Root']['Level2']
202
+
203
+ def test_skip_levels_up():
204
+ md_content = "# Root\n#### Deep\n## Back to 2"
205
+ result = create_heading_dict(md_content)
206
+ assert 'Root' in result
207
+ assert 'Deep' in result['Root']
208
+ assert 'Back to 2' in result['Root']
209
+ assert result['Root']['Deep'] == {}
210
+ assert result['Root']['Back to 2'] == {}
211
+
212
+ def test_non_h1_start():
213
+ md_content = "### Starting at 3\n## Going to 2\n# Finally 1"
214
+ result = create_heading_dict(md_content)
215
+ assert 'Starting at 3' in result
216
+ assert 'Going to 2' in result
217
+ assert 'Finally 1' in result
218
+
219
+ test_multiple_h1s()
220
+ test_skip_levels_down()
221
+ test_skip_levels_up()
222
+ test_non_h1_start()
223
+
224
+ # Critical edge case tests
225
+ def test_empty_input():
226
+ result = markdown_to_dict("")
227
+ assert result == {}
228
+ result = create_heading_dict("")
229
+ assert result == {}
230
+
231
+ def test_whitespace_only():
232
+ result = markdown_to_dict(" \n\t \n ")
233
+ assert result == {}
234
+ result = create_heading_dict(" \n\t \n ")
235
+ assert result == {}
236
+
237
+ def test_malformed_headings():
238
+ # No space after # (actually works - regex allows it)
239
+ md_content = "#NoSpace\n###AlsoNoSpace\nContent"
240
+ result = markdown_to_dict(md_content)
241
+ assert 'NoSpace' in result
242
+ assert 'NoSpace.AlsoNoSpace' in result
243
+
244
+ # Too many #s (matches max 6, extra # preserved in text)
245
+ md_content = "####### Too Many\nContent"
246
+ result = markdown_to_dict(md_content)
247
+ assert '# Too Many' in result # Extra # now preserved in heading text
248
+
249
+ # Empty heading (actually creates empty key)
250
+ md_content = "## \nContent after empty heading"
251
+ result = markdown_to_dict(md_content)
252
+ assert '' in result # Empty heading creates empty key
253
+
254
+ def test_unicode_and_emojis():
255
+ # Unicode characters
256
+ md_content = "# Café & Naïve\nContent with unicode\n## 中文标题\nChinese content"
257
+ result = markdown_to_dict(md_content)
258
+ assert 'Café & Naïve' in result
259
+ assert 'Café & Naïve.中文标题' in result
260
+
261
+ # Emojis
262
+ md_content = "# 🚀 Rocket Heading\nRocket content\n## 💻 Computer\nComputer content"
263
+ result = markdown_to_dict(md_content)
264
+ assert '🚀 Rocket Heading' in result
265
+ assert '🚀 Rocket Heading.💻 Computer' in result
266
+
267
+ test_empty_input()
268
+ test_whitespace_only()
269
+ test_malformed_headings()
270
+ test_unicode_and_emojis()
271
+ print('tests passed')
272
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: toolslm
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: Tools to make language models a bit easier to use
5
5
  Home-page: https://github.com/AnswerDotAI/toolslm
6
6
  Author: Jeremy Howard
@@ -1 +0,0 @@
1
- __version__ = "0.3.0"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes