toolslm 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
toolslm/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.3.0"
1
+ __version__ = "0.3.2"
toolslm/md_hier.py CHANGED
@@ -1,65 +1,63 @@
1
1
  import re
2
2
  from fastcore.utils import *
3
- __all__ = ['markdown_to_dict', 'create_heading_dict']
3
+ __all__ = ['create_heading_dict', 'HeadingDict']
4
4
 
5
- def markdown_to_dict(markdown_content):
6
- def clean_heading(text): return re.sub(r'[^A-Za-z0-9 ]+', '', text).strip()
5
+ class HeadingDict(dict):
6
+ """A dictionary-like object that also stores the markdown text content."""
7
+ def __init__(self, text="", *args, **kwargs):
8
+ super().__init__(*args, **kwargs)
9
+ self.text = text
7
10
 
8
- lines = markdown_content.splitlines()
11
+
12
+ def create_heading_dict(text, rm_fenced=True):
13
+ "Create a nested dictionary structure from markdown headings."
14
+ original_text = text
15
+ original_lines = text.splitlines()
16
+
17
+ # Use fenced-removed text only for finding headings
18
+ text_for_headings = text
19
+ if rm_fenced: text_for_headings = re.sub(r'```[\s\S]*?```', '', text)
20
+
21
+ lines_for_headings = text_for_headings.splitlines()
9
22
  headings = []
10
- in_code_block = False
11
23
 
12
24
  # Parse headings with their levels and line numbers
13
- for idx, line in enumerate(lines):
14
- # Toggle code block state when encountering fence
15
- if line.strip().startswith('```'): in_code_block = not in_code_block
16
-
17
- # Only detect headings when not in a code block
18
- if in_code_block: continue
19
- match = re.match(r'^(#{1,6})\s*(.*)', line)
25
+ for idx, line in enumerate(lines_for_headings):
26
+ match = re.match(r'^(#{1,6})\s+\S.*', line)
20
27
  if match:
21
28
  level = len(match.group(1))
22
- text = match.group(2).strip()
23
- headings.append({'level': level, 'text': text, 'line': idx})
29
+ title = line.strip('#').strip()
30
+ headings.append({'level': level, 'title': title, 'line': idx})
24
31
 
25
- # Assign content to each heading, including subheadings
32
+ # Assign text content to each heading using original lines
26
33
  for i, h in enumerate(headings):
27
- start = h['line'] # Include the heading line itself
34
+ start = h['line']
28
35
  # Find the end index: next heading of same or higher level
29
36
  for j in range(i + 1, len(headings)):
30
37
  if headings[j]['level'] <= h['level']:
31
38
  end = headings[j]['line']
32
39
  break
33
- else: end = len(lines)
34
- h['content'] = '\n'.join(lines[start:end]).strip()
40
+ else:
41
+ end = len(original_lines)
42
+ h['content'] = '\n'.join(original_lines[start:end]).strip()
35
43
 
36
- # Build the dictionary with hierarchical keys
37
- result,stack = {},[]
38
- first_level = headings[0]['level']
39
- for h in headings:
40
- stack = stack[:h['level'] - first_level] + [clean_heading(h['text'])]
41
- key = '.'.join(stack)
42
- result[key] = h['content']
43
- return dict2obj(result)
44
-
45
- def create_heading_dict(text):
46
- text = re.sub(r'```[\s\S]*?```', '', text)
47
- headings = re.findall(r'^#+.*', text, flags=re.MULTILINE)
48
- result = {}
44
+ # Build the nested structure
45
+ result = HeadingDict(original_text)
49
46
  stack = [result]
50
- prev_level = 0
47
+ stack_levels = [0]
51
48
 
52
- for heading in headings:
53
- level = heading.count('#')
54
- title = heading.strip('#').strip()
55
- while level <= prev_level:
49
+ for h in headings:
50
+ # Pop stack until we find the right parent level
51
+ while len(stack) > 1 and stack_levels[-1] >= h['level']:
56
52
  stack.pop()
57
- prev_level -= 1
58
- new_dict = {}
59
- stack[-1][title] = new_dict
53
+ stack_levels.pop()
54
+
55
+ new_dict = HeadingDict(h['content'])
56
+ stack[-1][h['title']] = new_dict
60
57
  stack.append(new_dict)
61
- prev_level = level
62
- return dict2obj(result)
58
+ stack_levels.append(h['level'])
59
+
60
+ return result
63
61
 
64
62
 
65
63
  if __name__=='__main__':
@@ -91,63 +89,71 @@ Admin section.
91
89
  Admin users management.
92
90
  """
93
91
 
94
- result = markdown_to_dict(md_content)
92
+ result = create_heading_dict(md_content)
95
93
  #for key, value in result.items(): print(f'Key: {key}\nValue:\n{value}\n{"-"*40}')
96
94
 
97
95
  def test_empty_content():
98
96
  md_content = "# Empty Heading"
99
- result = markdown_to_dict(md_content)
100
- assert result['Empty Heading'] == '# Empty Heading'
97
+ result = create_heading_dict(md_content)
98
+ assert 'Empty Heading' in result
99
+ assert result['Empty Heading'].text == '# Empty Heading'
100
+ assert result.text == md_content
101
101
 
102
102
  def test_special_characters():
103
103
  md_content = "# Heading *With* Special _Characters_!\nContent under heading."
104
- result = markdown_to_dict(md_content)
105
- assert 'Heading With Special Characters' in result
106
- assert result['Heading With Special Characters'] == '# Heading *With* Special _Characters_!\nContent under heading.'
104
+ result = create_heading_dict(md_content)
105
+ assert 'Heading *With* Special _Characters_!' in result
106
+ assert result['Heading *With* Special _Characters_!'].text == '# Heading *With* Special _Characters_!\nContent under heading.'
107
+ assert result.text == md_content
107
108
 
108
109
  def test_duplicate_headings():
109
110
  md_content = "# Duplicate\n## Duplicate\n### Duplicate\nContent under duplicate headings."
110
- result = markdown_to_dict(md_content)
111
+ result = create_heading_dict(md_content)
111
112
  assert 'Duplicate' in result
112
- assert 'Duplicate.Duplicate' in result
113
- assert 'Duplicate.Duplicate.Duplicate' in result
114
- assert result['Duplicate.Duplicate.Duplicate'] == '### Duplicate\nContent under duplicate headings.'
113
+ assert 'Duplicate' in result['Duplicate']
114
+ assert 'Duplicate' in result['Duplicate']['Duplicate']
115
+ assert result['Duplicate']['Duplicate']['Duplicate'].text == '### Duplicate\nContent under duplicate headings.'
116
+ assert result.text == md_content
115
117
 
116
118
  def test_no_content():
117
119
  md_content = "# No Content Heading\n## Subheading"
118
- result = markdown_to_dict(md_content)
119
- assert result['No Content Heading'] == '# No Content Heading\n## Subheading'
120
- assert result['No Content Heading.Subheading'] == '## Subheading'
120
+ result = create_heading_dict(md_content)
121
+ assert result['No Content Heading'].text == '# No Content Heading\n## Subheading'
122
+ assert result['No Content Heading']['Subheading'].text == '## Subheading'
123
+ assert result.text == md_content
121
124
 
122
125
  def test_different_levels():
123
126
  md_content = "### Level 3 Heading\nContent at level 3.\n# Level 1 Heading\nContent at level 1."
124
- result = markdown_to_dict(md_content)
127
+ result = create_heading_dict(md_content)
125
128
  assert 'Level 3 Heading' in result
126
129
  assert 'Level 1 Heading' in result
127
- assert result['Level 3 Heading'] == '### Level 3 Heading\nContent at level 3.'
128
- assert result['Level 1 Heading'] == '# Level 1 Heading\nContent at level 1.'
130
+ assert result['Level 3 Heading'].text == '### Level 3 Heading\nContent at level 3.'
131
+ assert result['Level 1 Heading'].text == '# Level 1 Heading\nContent at level 1.'
132
+ assert result.text == md_content
129
133
 
130
134
  def test_parent_includes_subheadings():
131
135
  md_content = "# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content."
132
- result = markdown_to_dict(md_content)
133
- assert result['Parent'] == '# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content.'
134
- assert result['Parent.Child'] == '## Child\nChild content.\n### Grandchild\nGrandchild content.'
135
- assert result['Parent.Child.Grandchild'] == '### Grandchild\nGrandchild content.'
136
+ result = create_heading_dict(md_content)
137
+ assert result['Parent'].text == '# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content.'
138
+ assert result['Parent']['Child'].text == '## Child\nChild content.\n### Grandchild\nGrandchild content.'
139
+ assert result['Parent']['Child']['Grandchild'].text == '### Grandchild\nGrandchild content.'
140
+ assert result.text == md_content
136
141
 
137
142
  def test_multiple_level2_siblings():
138
- md_content = "##Sib 1\n##Sib 2\n##Sib 3\n##Sib 4\n##Sib 5'"
139
- result = markdown_to_dict(md_content)
143
+ md_content = "## Sib 1\n## Sib 2\n## Sib 3\n## Sib 4\n## Sib 5'"
144
+ result = create_heading_dict(md_content)
140
145
  assert 'Sib 1' in result
141
146
  assert 'Sib 2' in result
142
147
  assert 'Sib 3' in result
143
148
  assert 'Sib 4' in result
144
- assert 'Sib 5' in result
145
-
149
+ assert "Sib 5'" in result
150
+ assert result.text == md_content
151
+
146
152
  def test_code_chunks_escaped():
147
153
  md_content = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```"
148
- result = markdown_to_dict(md_content)
149
- assert 'Code comment' not in result
150
- assert "# Code comment" in result['Parent.Child']
154
+ result = create_heading_dict(md_content)
155
+ assert 'Code comment' not in str(result)
156
+ assert result.text == md_content
151
157
 
152
158
  test_empty_content()
153
159
  test_special_characters()
@@ -159,7 +165,7 @@ Admin users management.
159
165
  test_code_chunks_escaped()
160
166
  print('tests passed')
161
167
 
162
- def test_nested_headings():
168
+ def test_nested_headings():
163
169
  md_content = "# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content."
164
170
  result = create_heading_dict(md_content)
165
171
  assert 'Child' in result['Parent']
@@ -169,7 +175,104 @@ Admin users management.
169
175
  md_content = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```"
170
176
  result = create_heading_dict(md_content)
171
177
  assert 'Code comment' not in result
172
-
178
+
179
+ def test_fenced_blocks_preserved_in_text():
180
+ md_content = """# Section
181
+ Content before code.
182
+
183
+ ```python
184
+ # This heading should be ignored for structure
185
+ def hello():
186
+ print("Hello, world!")
187
+ ```
188
+
189
+ More content after code."""
190
+ result = create_heading_dict(md_content)
191
+ # Fenced code should be preserved in text content
192
+ assert '```python' in result['Section'].text
193
+ assert 'def hello():' in result['Section'].text
194
+ assert '```' in result['Section'].text
195
+ # But headings inside fenced blocks should not create structure
196
+ assert 'This heading should be ignored for structure' not in result['Section']
197
+
173
198
  test_nested_headings()
174
199
  test_code_chunks_escaped()
175
- print('tests passed')
200
+ test_fenced_blocks_preserved_in_text()
201
+
202
+ def test_multiple_h1s():
203
+ md_content = "# First H1\n# Second H1\n# Third H1"
204
+ result = create_heading_dict(md_content)
205
+ assert 'First H1' in result
206
+ assert 'Second H1' in result
207
+ assert 'Third H1' in result
208
+ assert result['First H1'] == {}
209
+ assert result['Second H1'] == {}
210
+ assert result['Third H1'] == {}
211
+
212
+ def test_skip_levels_down():
213
+ md_content = "# Root\n## Level2\n#### Level4"
214
+ result = create_heading_dict(md_content)
215
+ assert 'Root' in result
216
+ assert 'Level2' in result['Root']
217
+ assert 'Level4' in result['Root']['Level2']
218
+
219
+ def test_skip_levels_up():
220
+ md_content = "# Root\n#### Deep\n## Back to 2"
221
+ result = create_heading_dict(md_content)
222
+ assert 'Root' in result
223
+ assert 'Deep' in result['Root']
224
+ assert 'Back to 2' in result['Root']
225
+ assert result['Root']['Deep'] == {}
226
+ assert result['Root']['Back to 2'] == {}
227
+
228
+ def test_non_h1_start():
229
+ md_content = "### Starting at 3\n## Going to 2\n# Finally 1"
230
+ result = create_heading_dict(md_content)
231
+ assert 'Starting at 3' in result
232
+ assert 'Going to 2' in result
233
+ assert 'Finally 1' in result
234
+
235
+ test_multiple_h1s()
236
+ test_skip_levels_down()
237
+ test_skip_levels_up()
238
+ test_non_h1_start()
239
+
240
+ # Edge case tests
241
+ def test_empty_input():
242
+ result = create_heading_dict("")
243
+ assert result == {}
244
+ assert result.text == ""
245
+
246
+ def test_whitespace_only():
247
+ result = create_heading_dict(" \n\t \n ")
248
+ assert result == {}
249
+ assert result.text == " \n\t \n "
250
+
251
+ def test_malformed_headings():
252
+ # Too many #s (matches max 6)
253
+ md_content = "####### Too Many\nContent"
254
+ result = create_heading_dict(md_content)
255
+ assert 'Too Many' not in result
256
+ assert result.text == md_content
257
+
258
+ def test_unicode_and_emojis():
259
+ # Unicode characters
260
+ md_content = "# Café & Naïve\nContent with unicode\n## 中文标题\nChinese content"
261
+ result = create_heading_dict(md_content)
262
+ assert 'Café & Naïve' in result
263
+ assert '中文标题' in result['Café & Naïve']
264
+ assert result.text == md_content
265
+
266
+ # Emojis
267
+ md_content = "# 🚀 Rocket Heading\nRocket content\n## 💻 Computer\nComputer content"
268
+ result = create_heading_dict(md_content)
269
+ assert '🚀 Rocket Heading' in result
270
+ assert '💻 Computer' in result['🚀 Rocket Heading']
271
+ assert result.text == md_content
272
+
273
+ test_empty_input()
274
+ test_whitespace_only()
275
+ test_malformed_headings()
276
+ test_unicode_and_emojis()
277
+ print('tests passed')
278
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: toolslm
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: Tools to make language models a bit easier to use
5
5
  Home-page: https://github.com/AnswerDotAI/toolslm
6
6
  Author: Jeremy Howard
@@ -0,0 +1,13 @@
1
+ toolslm/__init__.py,sha256=vNiWJ14r_cw5t_7UDqDQIVZvladKFGyHH2avsLpN7Vg,22
2
+ toolslm/_modidx.py,sha256=-D-B5o30VGs11gBKf96lpADVXnZhdiVEshJpLzmUnDs,4378
3
+ toolslm/download.py,sha256=g3BxUSxylC_575M7RFSJ1GI3Co3EwPDdEeWzxaf2Czk,4451
4
+ toolslm/funccall.py,sha256=7nPfbcvDRMWiVKBKMLlCOMInoUJgDs5e38ef2T7QBHY,8485
5
+ toolslm/md_hier.py,sha256=Havk9Hf0t2Xt67n_r7ZxCsS0pciR85iLcE5quShvkTg,10032
6
+ toolslm/shell.py,sha256=dGInuRKvexu21VmtZkw_0S3BGiTsbAongUG-yG4YHpc,1566
7
+ toolslm/xml.py,sha256=D665Nk7NzyZlXyXrpnIRqfK2xQ-6Gf0bCSgocjF7zik,4061
8
+ toolslm-0.3.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
9
+ toolslm-0.3.2.dist-info/METADATA,sha256=5lWEv7BWTwdd5cvXgGsQXqr0j6tk8UIcGpRTlcjV3V4,2404
10
+ toolslm-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
11
+ toolslm-0.3.2.dist-info/entry_points.txt,sha256=xFz0Eymlo5X7BGpaO6DI9gMxvN5A7faebzrlr8ctp5I,95
12
+ toolslm-0.3.2.dist-info/top_level.txt,sha256=4hRTrFWayz_Kz5221XjvlpCwVFrW3WPi1P0fllkTq9s,8
13
+ toolslm-0.3.2.dist-info/RECORD,,
@@ -1,13 +0,0 @@
1
- toolslm/__init__.py,sha256=VrXpHDu3erkzwl_WXrqINBm9xWkcyUy53IQOj042dOs,22
2
- toolslm/_modidx.py,sha256=-D-B5o30VGs11gBKf96lpADVXnZhdiVEshJpLzmUnDs,4378
3
- toolslm/download.py,sha256=g3BxUSxylC_575M7RFSJ1GI3Co3EwPDdEeWzxaf2Czk,4451
4
- toolslm/funccall.py,sha256=7nPfbcvDRMWiVKBKMLlCOMInoUJgDs5e38ef2T7QBHY,8485
5
- toolslm/md_hier.py,sha256=4uC12443tPBduYJgIZZIcEat2VG0x7JYC8-SwDdS2JY,6360
6
- toolslm/shell.py,sha256=dGInuRKvexu21VmtZkw_0S3BGiTsbAongUG-yG4YHpc,1566
7
- toolslm/xml.py,sha256=D665Nk7NzyZlXyXrpnIRqfK2xQ-6Gf0bCSgocjF7zik,4061
8
- toolslm-0.3.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
9
- toolslm-0.3.0.dist-info/METADATA,sha256=JmN3o1_BAvUgUWi7q8j8uxpweyQzG6qTMt2u_NgASdU,2404
10
- toolslm-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
11
- toolslm-0.3.0.dist-info/entry_points.txt,sha256=xFz0Eymlo5X7BGpaO6DI9gMxvN5A7faebzrlr8ctp5I,95
12
- toolslm-0.3.0.dist-info/top_level.txt,sha256=4hRTrFWayz_Kz5221XjvlpCwVFrW3WPi1P0fllkTq9s,8
13
- toolslm-0.3.0.dist-info/RECORD,,