PyPI - toolslm - Versions diffs - 0.3.0__tar.gz → 0.3.1__tar.gz - Mend

toolslm-0.3.0.tar.gz → toolslm-0.3.1.tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (23) hide show

{toolslm-0.3.0/toolslm.egg-info → toolslm-0.3.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: toolslm
-Version: 0.3.0
+Version: 0.3.1
 Summary: Tools to make language models a bit easier to use
 Home-page: https://github.com/AnswerDotAI/toolslm
 Author: Jeremy Howard

{toolslm-0.3.0 → toolslm-0.3.1}/settings.ini RENAMED Viewed

@@ -1,7 +1,7 @@
 [DEFAULT]
 repo = toolslm
 lib_name = toolslm
-version = 0.3.0
+version = 0.3.1
 min_python = 3.9
 license = apache2
 black_formatting = False

toolslm-0.3.1/toolslm/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.3.1"

{toolslm-0.3.0 → toolslm-0.3.1}/toolslm/md_hier.py RENAMED Viewed

@@ -2,8 +2,11 @@ import re
 from fastcore.utils import *
 __all__ = ['markdown_to_dict', 'create_heading_dict']
-def markdown_to_dict(markdown_content):
-    def clean_heading(text): return re.sub(r'[^A-Za-z0-9 ]+', '', text).strip()
+def markdown_to_dict(
+    markdown_content:str  # Markdown text including headings
+)->AttrDict: # Dictionary with dot-separated hierarchical keys and content values
+    "Parse markdown content into a hierarchical dictionary with dot-separated keys."
+    def clean_heading(text): return re.sub(r'[.]+', '', text).strip()  # Only remove dots (key separator)
     lines = markdown_content.splitlines()
     headings = []
@@ -13,7 +16,6 @@ def markdown_to_dict(markdown_content):
     for idx, line in enumerate(lines):
         # Toggle code block state when encountering fence
         if line.strip().startswith('```'): in_code_block = not in_code_block
         # Only detect headings when not in a code block
         if in_code_block: continue
         match = re.match(r'^(#{1,6})\s*(.*)', line)
@@ -35,6 +37,9 @@ def markdown_to_dict(markdown_content):
     # Build the dictionary with hierarchical keys
     result,stack = {},[]
+    if not headings:
+        return dict2obj(result)
     first_level = headings[0]['level']
     for h in headings:
         stack = stack[:h['level'] - first_level] + [clean_heading(h['text'])]
@@ -42,23 +47,28 @@ def markdown_to_dict(markdown_content):
         result[key] = h['content']
     return dict2obj(result)
-def create_heading_dict(text):
-    text = re.sub(r'```[\s\S]*?```', '', text)
+def create_heading_dict(text, rm_fenced=True):
+    "Create a nested dictionary structure from markdown headings."
+    if rm_fenced: text = re.sub(r'```[\s\S]*?```', '', text)
     headings = re.findall(r'^#+.*', text, flags=re.MULTILINE)
     result = {}
     stack = [result]
-    prev_level = 0
+    stack_levels = [0]  # Track the level at each stack position
     for heading in headings:
         level = heading.count('#')
         title = heading.strip('#').strip()
-        while level <= prev_level:
+        # Pop stack until we find the right parent level
+        while len(stack) > 1 and stack_levels[-1] >= level:
             stack.pop()
-            prev_level -= 1
+            stack_levels.pop()
         new_dict = {}
         stack[-1][title] = new_dict
         stack.append(new_dict)
-        prev_level = level
+        stack_levels.append(level)
     return dict2obj(result)
@@ -102,8 +112,8 @@ Admin users management.
     def test_special_characters():
         md_content = "# Heading *With* Special _Characters_!\nContent under heading."
         result = markdown_to_dict(md_content)
-        assert 'Heading With Special Characters' in result
-        assert result['Heading With Special Characters'] == '# Heading *With* Special _Characters_!\nContent under heading.'
+        assert 'Heading *With* Special _Characters_!' in result
+        assert result['Heading *With* Special _Characters_!'] == '# Heading *With* Special _Characters_!\nContent under heading.'
     def test_duplicate_headings():
         md_content = "# Duplicate\n## Duplicate\n### Duplicate\nContent under duplicate headings."
@@ -141,8 +151,8 @@ Admin users management.
         assert 'Sib 2' in result
         assert 'Sib 3' in result
         assert 'Sib 4' in result
-        assert 'Sib 5' in result
+        assert "Sib 5'" in result  # Note the apostrophe is preserved
     def test_code_chunks_escaped():
         md_content = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```"
         result = markdown_to_dict(md_content)
@@ -159,7 +169,7 @@ Admin users management.
     test_code_chunks_escaped()
     print('tests passed')
-    def test_nested_headings():
+    def test_nested_headings():
         md_content = "# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content."
         result = create_heading_dict(md_content)
         assert 'Child' in result['Parent']
@@ -169,7 +179,94 @@ Admin users management.
         md_content = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```"
         result = create_heading_dict(md_content)
         assert 'Code comment' not in result
     test_nested_headings()
     test_code_chunks_escaped()
-    print('tests passed')
+    def test_multiple_h1s():
+        md_content = "# First H1\n# Second H1\n# Third H1"
+        result = create_heading_dict(md_content)
+        assert 'First H1' in result
+        assert 'Second H1' in result
+        assert 'Third H1' in result
+        assert result['First H1'] == {}
+        assert result['Second H1'] == {}
+        assert result['Third H1'] == {}
+    def test_skip_levels_down():
+        md_content = "# Root\n## Level2\n#### Level4"
+        result = create_heading_dict(md_content)
+        assert 'Root' in result
+        assert 'Level2' in result['Root']
+        assert 'Level4' in result['Root']['Level2']
+    def test_skip_levels_up():
+        md_content = "# Root\n#### Deep\n## Back to 2"
+        result = create_heading_dict(md_content)
+        assert 'Root' in result
+        assert 'Deep' in result['Root']
+        assert 'Back to 2' in result['Root']
+        assert result['Root']['Deep'] == {}
+        assert result['Root']['Back to 2'] == {}
+    def test_non_h1_start():
+        md_content = "### Starting at 3\n## Going to 2\n# Finally 1"
+        result = create_heading_dict(md_content)
+        assert 'Starting at 3' in result
+        assert 'Going to 2' in result
+        assert 'Finally 1' in result
+    test_multiple_h1s()
+    test_skip_levels_down()
+    test_skip_levels_up()
+    test_non_h1_start()
+    # Critical edge case tests
+    def test_empty_input():
+        result = markdown_to_dict("")
+        assert result == {}
+        result = create_heading_dict("")
+        assert result == {}
+    def test_whitespace_only():
+        result = markdown_to_dict("   \n\t  \n   ")
+        assert result == {}
+        result = create_heading_dict("   \n\t  \n   ")
+        assert result == {}
+    def test_malformed_headings():
+        # No space after # (actually works - regex allows it)
+        md_content = "#NoSpace\n###AlsoNoSpace\nContent"
+        result = markdown_to_dict(md_content)
+        assert 'NoSpace' in result
+        assert 'NoSpace.AlsoNoSpace' in result
+        # Too many #s (matches max 6, extra # preserved in text)
+        md_content = "####### Too Many\nContent"
+        result = markdown_to_dict(md_content)
+        assert '# Too Many' in result  # Extra # now preserved in heading text
+        # Empty heading (actually creates empty key)
+        md_content = "##   \nContent after empty heading"
+        result = markdown_to_dict(md_content)
+        assert '' in result  # Empty heading creates empty key
+    def test_unicode_and_emojis():
+        # Unicode characters
+        md_content = "# Café & Naïve\nContent with unicode\n## 中文标题\nChinese content"
+        result = markdown_to_dict(md_content)
+        assert 'Café & Naïve' in result
+        assert 'Café & Naïve.中文标题' in result
+        # Emojis
+        md_content = "# 🚀 Rocket Heading\nRocket content\n## 💻 Computer\nComputer content"
+        result = markdown_to_dict(md_content)
+        assert '🚀 Rocket Heading' in result
+        assert '🚀 Rocket Heading.💻 Computer' in result
+    test_empty_input()
+    test_whitespace_only()
+    test_malformed_headings()
+    test_unicode_and_emojis()
+    print('tests passed')

{toolslm-0.3.0 → toolslm-0.3.1/toolslm.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: toolslm
-Version: 0.3.0
+Version: 0.3.1
 Summary: Tools to make language models a bit easier to use
 Home-page: https://github.com/AnswerDotAI/toolslm
 Author: Jeremy Howard