toolslm 0.3.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: toolslm
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: Tools to make language models a bit easier to use
5
5
  Home-page: https://github.com/AnswerDotAI/toolslm
6
6
  Author: Jeremy Howard
@@ -1,7 +1,7 @@
1
1
  [DEFAULT]
2
2
  repo = toolslm
3
3
  lib_name = toolslm
4
- version = 0.3.0
4
+ version = 0.3.2
5
5
  min_python = 3.9
6
6
  license = apache2
7
7
  black_formatting = False
@@ -0,0 +1 @@
1
+ __version__ = "0.3.2"
@@ -0,0 +1,278 @@
1
+ import re
2
+ from fastcore.utils import *
3
+ __all__ = ['create_heading_dict', 'HeadingDict']
4
+
5
class HeadingDict(dict):
    """Dict of sub-headings that also carries a piece of markdown text.

    The first positional argument is stored on the ``text`` attribute; all
    remaining positional and keyword arguments are handed to ``dict`` as-is.
    """

    def __init__(self, text="", *args, **kwargs):
        # `text` is consumed here, so only the leftover arguments reach dict.
        self.text = text
        super().__init__(*args, **kwargs)
10
+
11
+
12
def create_heading_dict(text, rm_fenced=True):
    """Create a nested dictionary structure from markdown headings.

    Every ATX heading (one to six ``#`` followed by whitespace and some text)
    becomes a key in the `HeadingDict` of the nearest preceding heading with a
    smaller level, or in the returned root when there is none. A repeated
    title under the same parent replaces the earlier entry.

    The ``text`` attribute of each entry is the original markdown from the
    heading line up to, but not including, the next heading of the same or a
    smaller level (stripped of surrounding whitespace). The root's ``text`` is
    the unmodified input.

    Args:
        text: Markdown source.
        rm_fenced: When true, lines inside triple-backtick fences are not
            considered as headings. Fenced code is still kept in ``text``.

    Returns:
        The root `HeadingDict`.
    """
    original_lines = text.splitlines()

    # Headings are searched in a copy of the text with fenced code blanked out.
    # The copy must keep the same line count as the input: the line numbers
    # found in it are used below to slice `original_lines`.
    text_for_headings = text
    if rm_fenced:
        def _blank(m):
            block = m.group(0)
            # A fence contained in one line cannot shift line numbers: drop it.
            if len(block.splitlines()) == 1: return ''
            # Otherwise overwrite visible characters with spaces. Every line
            # break is whitespace, so all of them stay exactly where they were.
            return re.sub(r'\S', ' ', block)
        text_for_headings = re.sub(r'```[\s\S]*?```', _blank, text)

    # Parse headings with their levels and line numbers
    heading_re = re.compile(r'^(#{1,6})\s+\S.*')
    headings = []
    for idx, line in enumerate(text_for_headings.splitlines()):
        match = heading_re.match(line)
        if match:
            headings.append({'level': len(match.group(1)),
                             'title': line.strip('#').strip(),
                             'line': idx})

    # Assign text content to each heading using the original lines. A section
    # ends at the next heading of the same or a smaller level, else at EOF.
    for i, h in enumerate(headings):
        end = next((nxt['line'] for nxt in headings[i + 1:] if nxt['level'] <= h['level']),
                   len(original_lines))
        h['content'] = '\n'.join(original_lines[h['line']:end]).strip()

    # Build the nested structure. `stack` holds the chain of open sections and
    # `stack_levels` their heading levels; the root acts as level 0.
    result = HeadingDict(text)
    stack = [result]
    stack_levels = [0]

    for h in headings:
        # Close every open section that is not shallower than this heading.
        while len(stack) > 1 and stack_levels[-1] >= h['level']:
            stack.pop()
            stack_levels.pop()

        new_dict = HeadingDict(h['content'])
        stack[-1][h['title']] = new_dict
        stack.append(new_dict)
        stack_levels.append(h['level'])

    return result
61
+
62
+
63
if __name__=='__main__':
    # Self-test: run this module as a script to exercise create_heading_dict.
    md_content = """
# User

This is the User section.

## Tokens

Details about tokens.

### Value

The value of tokens.

Some more details.

## Settings

User settings information.

# Admin

Admin section.

## Users

Admin users management.
"""

    # Smoke run on a representative document; the result is not inspected.
    result = create_heading_dict(md_content)

    def test_empty_content():
        md_content = "# Empty Heading"
        result = create_heading_dict(md_content)
        assert 'Empty Heading' in result
        assert result['Empty Heading'].text == '# Empty Heading'
        assert result.text == md_content

    def test_special_characters():
        md_content = "# Heading *With* Special _Characters_!\nContent under heading."
        result = create_heading_dict(md_content)
        assert 'Heading *With* Special _Characters_!' in result
        assert result['Heading *With* Special _Characters_!'].text == '# Heading *With* Special _Characters_!\nContent under heading.'
        assert result.text == md_content

    def test_duplicate_headings():
        md_content = "# Duplicate\n## Duplicate\n### Duplicate\nContent under duplicate headings."
        result = create_heading_dict(md_content)
        assert 'Duplicate' in result
        assert 'Duplicate' in result['Duplicate']
        assert 'Duplicate' in result['Duplicate']['Duplicate']
        assert result['Duplicate']['Duplicate']['Duplicate'].text == '### Duplicate\nContent under duplicate headings.'
        assert result.text == md_content

    def test_no_content():
        md_content = "# No Content Heading\n## Subheading"
        result = create_heading_dict(md_content)
        assert result['No Content Heading'].text == '# No Content Heading\n## Subheading'
        assert result['No Content Heading']['Subheading'].text == '## Subheading'
        assert result.text == md_content

    def test_different_levels():
        md_content = "### Level 3 Heading\nContent at level 3.\n# Level 1 Heading\nContent at level 1."
        result = create_heading_dict(md_content)
        assert 'Level 3 Heading' in result
        assert 'Level 1 Heading' in result
        assert result['Level 3 Heading'].text == '### Level 3 Heading\nContent at level 3.'
        assert result['Level 1 Heading'].text == '# Level 1 Heading\nContent at level 1.'
        assert result.text == md_content

    def test_parent_includes_subheadings():
        md_content = "# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content."
        result = create_heading_dict(md_content)
        assert result['Parent'].text == '# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content.'
        assert result['Parent']['Child'].text == '## Child\nChild content.\n### Grandchild\nGrandchild content.'
        assert result['Parent']['Child']['Grandchild'].text == '### Grandchild\nGrandchild content.'
        assert result.text == md_content

    def test_multiple_level2_siblings():
        md_content = "## Sib 1\n## Sib 2\n## Sib 3\n## Sib 4\n## Sib 5'"
        result = create_heading_dict(md_content)
        assert 'Sib 1' in result
        assert 'Sib 2' in result
        assert 'Sib 3' in result
        assert 'Sib 4' in result
        assert "Sib 5'" in result
        assert result.text == md_content

    def test_code_chunks_escaped():
        md_content = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```"
        result = create_heading_dict(md_content)
        assert 'Code comment' not in str(result)
        assert result.text == md_content

    def test_nested_headings():
        md_content = "# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content."
        result = create_heading_dict(md_content)
        assert 'Child' in result['Parent']
        assert 'Grandchild' in result['Parent']['Child']

    # Distinct name: this used to redefine test_code_chunks_escaped above.
    def test_code_chunks_not_top_level_keys():
        md_content = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```"
        result = create_heading_dict(md_content)
        assert 'Code comment' not in result

    def test_fenced_blocks_preserved_in_text():
        md_content = """# Section
Content before code.

```python
# This heading should be ignored for structure
def hello():
    print("Hello, world!")
```

More content after code."""
        result = create_heading_dict(md_content)
        # Fenced code should be preserved in text content
        assert '```python' in result['Section'].text
        assert 'def hello():' in result['Section'].text
        assert '```' in result['Section'].text
        # But headings inside fenced blocks should not create structure
        assert 'This heading should be ignored for structure' not in result['Section']

    def test_multiple_h1s():
        md_content = "# First H1\n# Second H1\n# Third H1"
        result = create_heading_dict(md_content)
        assert 'First H1' in result
        assert 'Second H1' in result
        assert 'Third H1' in result
        assert result['First H1'] == {}
        assert result['Second H1'] == {}
        assert result['Third H1'] == {}

    def test_skip_levels_down():
        md_content = "# Root\n## Level2\n#### Level4"
        result = create_heading_dict(md_content)
        assert 'Root' in result
        assert 'Level2' in result['Root']
        assert 'Level4' in result['Root']['Level2']

    def test_skip_levels_up():
        md_content = "# Root\n#### Deep\n## Back to 2"
        result = create_heading_dict(md_content)
        assert 'Root' in result
        assert 'Deep' in result['Root']
        assert 'Back to 2' in result['Root']
        assert result['Root']['Deep'] == {}
        assert result['Root']['Back to 2'] == {}

    def test_non_h1_start():
        md_content = "### Starting at 3\n## Going to 2\n# Finally 1"
        result = create_heading_dict(md_content)
        assert 'Starting at 3' in result
        assert 'Going to 2' in result
        assert 'Finally 1' in result

    # Edge case tests
    def test_empty_input():
        result = create_heading_dict("")
        assert result == {}
        assert result.text == ""

    def test_whitespace_only():
        result = create_heading_dict(" \n\t \n ")
        assert result == {}
        assert result.text == " \n\t \n "

    def test_malformed_headings():
        # Too many #s (matches max 6)
        md_content = "####### Too Many\nContent"
        result = create_heading_dict(md_content)
        assert 'Too Many' not in result
        assert result.text == md_content

    def test_unicode_and_emojis():
        # Unicode characters
        md_content = "# Café & Naïve\nContent with unicode\n## 中文标题\nChinese content"
        result = create_heading_dict(md_content)
        assert 'Café & Naïve' in result
        assert '中文标题' in result['Café & Naïve']
        assert result.text == md_content

        # Emojis
        md_content = "# 🚀 Rocket Heading\nRocket content\n## 💻 Computer\nComputer content"
        result = create_heading_dict(md_content)
        assert '🚀 Rocket Heading' in result
        assert '💻 Computer' in result['🚀 Rocket Heading']
        assert result.text == md_content

    # Run everything first, then report once, so that "tests passed" is only
    # printed when every test above has actually run.
    for test in (
        test_empty_content,
        test_special_characters,
        test_duplicate_headings,
        test_no_content,
        test_different_levels,
        test_parent_includes_subheadings,
        test_multiple_level2_siblings,
        test_code_chunks_escaped,
        test_nested_headings,
        test_code_chunks_not_top_level_keys,
        test_fenced_blocks_preserved_in_text,
        test_multiple_h1s,
        test_skip_levels_down,
        test_skip_levels_up,
        test_non_h1_start,
        test_empty_input,
        test_whitespace_only,
        test_malformed_headings,
        test_unicode_and_emojis,
    ):
        test()
    print('tests passed')
278
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: toolslm
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: Tools to make language models a bit easier to use
5
5
  Home-page: https://github.com/AnswerDotAI/toolslm
6
6
  Author: Jeremy Howard
@@ -1 +0,0 @@
1
- __version__ = "0.3.0"
@@ -1,175 +0,0 @@
1
- import re
2
- from fastcore.utils import *
3
- __all__ = ['markdown_to_dict', 'create_heading_dict']
4
-
5
def markdown_to_dict(markdown_content):
    """Map dotted heading paths to the markdown text of each section.

    Keys are the cleaned titles of a heading and its ancestors joined with
    ``.`` (cleaning removes every character other than ASCII letters, digits
    and spaces). Each value is the text from the heading line up to, but not
    including, the next heading of the same or a smaller level. Lines between
    triple-backtick fence markers are never treated as headings.

    Args:
        markdown_content: Markdown source.

    Returns:
        The flat mapping passed through ``dict2obj``; empty when the input has
        no headings.
    """
    def clean_heading(text): return re.sub(r'[^A-Za-z0-9 ]+', '', text).strip()

    lines = markdown_content.splitlines()
    headings = []
    in_code_block = False

    # Parse headings with their levels and line numbers
    for idx, line in enumerate(lines):
        # Toggle code block state when encountering fence
        if line.strip().startswith('```'): in_code_block = not in_code_block

        # Only detect headings when not in a code block
        if in_code_block: continue
        match = re.match(r'^(#{1,6})\s*(.*)', line)
        if match:
            headings.append({'level': len(match.group(1)),
                             'text': match.group(2).strip(),
                             'line': idx})

    # Assign content to each heading, including subheadings
    for i, h in enumerate(headings):
        # A section ends at the next heading of same or higher level, else EOF.
        end = next((nxt['line'] for nxt in headings[i + 1:] if nxt['level'] <= h['level']),
                   len(lines))
        # The slice starts at the heading line itself.
        h['content'] = '\n'.join(lines[h['line']:end]).strip()

    # Build the dictionary with hierarchical keys. Depth is measured relative
    # to the first heading; with no headings the loop simply does not run.
    result, stack = {}, []
    first_level = headings[0]['level'] if headings else 0
    for h in headings:
        # Clamp at 0: a heading shallower than the first one is a new root. A
        # negative bound would slice from the end of the stack instead.
        depth = max(0, h['level'] - first_level)
        stack = stack[:depth] + [clean_heading(h['text'])]
        result['.'.join(stack)] = h['content']
    return dict2obj(result)
44
-
45
def create_heading_dict(text):
    """Create a nested dictionary of markdown heading titles.

    Text enclosed in a pair of triple-backtick fences is removed first, so
    ``#`` lines inside fenced code are not seen. Every remaining line that
    starts with ``#`` becomes a key under the nearest preceding heading with
    fewer leading ``#``, or under the root when there is none.

    Args:
        text: Markdown source.

    Returns:
        The nested dict of titles passed through ``dict2obj``.
    """
    text = re.sub(r'```[\s\S]*?```', '', text)
    headings = re.findall(r'^#+.*', text, flags=re.MULTILINE)
    result = {}
    # Parallel stacks: the chain of open sections and their heading levels.
    # The root is level 0 and every heading is at least 1, so the root is
    # never popped.
    stack, levels = [result], [0]

    for heading in headings:
        # Count only the leading '#' run; a '#' inside the title is not depth.
        level = len(heading) - len(heading.lstrip('#'))
        title = heading.strip('#').strip()
        # Close every open section that is not shallower than this heading.
        while levels[-1] >= level:
            stack.pop()
            levels.pop()
        new_dict = {}
        stack[-1][title] = new_dict
        stack.append(new_dict)
        levels.append(level)
    return dict2obj(result)
63
-
64
-
65
if __name__=='__main__':
    # Self-test: run this module as a script to exercise both parsers.
    md_content = """
# User

This is the User section.

## Tokens

Details about tokens.

### Value

The value of tokens.

Some more details.

## Settings

User settings information.

# Admin

Admin section.

## Users

Admin users management.
"""

    # Smoke run on a representative document; the result is not inspected.
    result = markdown_to_dict(md_content)

    def test_empty_content():
        md_content = "# Empty Heading"
        result = markdown_to_dict(md_content)
        assert result['Empty Heading'] == '# Empty Heading'

    def test_special_characters():
        md_content = "# Heading *With* Special _Characters_!\nContent under heading."
        result = markdown_to_dict(md_content)
        assert 'Heading With Special Characters' in result
        assert result['Heading With Special Characters'] == '# Heading *With* Special _Characters_!\nContent under heading.'

    def test_duplicate_headings():
        md_content = "# Duplicate\n## Duplicate\n### Duplicate\nContent under duplicate headings."
        result = markdown_to_dict(md_content)
        assert 'Duplicate' in result
        assert 'Duplicate.Duplicate' in result
        assert 'Duplicate.Duplicate.Duplicate' in result
        assert result['Duplicate.Duplicate.Duplicate'] == '### Duplicate\nContent under duplicate headings.'

    def test_no_content():
        md_content = "# No Content Heading\n## Subheading"
        result = markdown_to_dict(md_content)
        assert result['No Content Heading'] == '# No Content Heading\n## Subheading'
        assert result['No Content Heading.Subheading'] == '## Subheading'

    def test_different_levels():
        md_content = "### Level 3 Heading\nContent at level 3.\n# Level 1 Heading\nContent at level 1."
        result = markdown_to_dict(md_content)
        assert 'Level 3 Heading' in result
        assert 'Level 1 Heading' in result
        assert result['Level 3 Heading'] == '### Level 3 Heading\nContent at level 3.'
        assert result['Level 1 Heading'] == '# Level 1 Heading\nContent at level 1.'

    def test_parent_includes_subheadings():
        md_content = "# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content."
        result = markdown_to_dict(md_content)
        assert result['Parent'] == '# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content.'
        assert result['Parent.Child'] == '## Child\nChild content.\n### Grandchild\nGrandchild content.'
        assert result['Parent.Child.Grandchild'] == '### Grandchild\nGrandchild content.'

    def test_multiple_level2_siblings():
        md_content = "##Sib 1\n##Sib 2\n##Sib 3\n##Sib 4\n##Sib 5'"
        result = markdown_to_dict(md_content)
        assert 'Sib 1' in result
        assert 'Sib 2' in result
        assert 'Sib 3' in result
        assert 'Sib 4' in result
        assert 'Sib 5' in result

    def test_code_chunks_escaped():
        md_content = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```"
        result = markdown_to_dict(md_content)
        assert 'Code comment' not in result
        assert "# Code comment" in result['Parent.Child']

    def test_nested_headings():
        md_content = "# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content."
        result = create_heading_dict(md_content)
        assert 'Child' in result['Parent']
        assert 'Grandchild' in result['Parent']['Child']

    # Distinct name: this used to redefine test_code_chunks_escaped above.
    def test_heading_dict_code_chunks_escaped():
        md_content = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```"
        result = create_heading_dict(md_content)
        assert 'Code comment' not in result

    # Run everything first, then report once, so that "tests passed" is only
    # printed when every test above has actually run.
    for test in (
        test_empty_content,
        test_special_characters,
        test_duplicate_headings,
        test_no_content,
        test_different_levels,
        test_parent_includes_subheadings,
        test_multiple_level2_siblings,
        test_code_chunks_escaped,
        test_nested_headings,
        test_heading_dict_code_chunks_escaped,
    ):
        test()
    print('tests passed')
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes