PyPI - content-extraction - Versions diffs - 0.4.2__tar.gz → 0.4.4__tar.gz - Mend

content-extraction 0.4.2.tar.gz → 0.4.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

{content_extraction-0.4.2 → content_extraction-0.4.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content_extraction
-Version: 0.4.2
+Version: 0.4.4
 Summary: Project dedicated to content extraction from unstructured files that contain some useful information.
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown

{content_extraction-0.4.2 → content_extraction-0.4.4}/pyproject.toml RENAMED Viewed

@@ -10,7 +10,7 @@ where = ["src"]
 [project]
 name = "content_extraction"
-version = "0.4.2"
+version = "0.4.4"
 description = "Project dedicated to content extraction from unstructured files that contain some useful information."
 readme = "README.md"
 requires-python = ">=3.12"

{content_extraction-0.4.2 → content_extraction-0.4.4}/src/content_extraction/split_and_create_digest.py RENAMED Viewed

@@ -1,4 +1,5 @@
 from langdetect import detect
+from langdetect.lang_detect_exception import LangDetectException
 import sys
 import argparse
 import hashlib
@@ -35,6 +36,7 @@ class ProcessResultNode:
     title: str
     text: str
     section_digest: SectionDigestNode
+    language: str
 def shorten_text(text: str, max_elements: int = 2, subsections: list[dict] | None = None) -> str:
@@ -89,7 +91,11 @@ def process_node(node: dict, parent_digest_hash: str | None = None) -> list[dict
     Recursively process a node and its subsections, returning a flat list of nodes.
     """
     text = node.get('text', '')
-    language = detect(text)
+    try:
+        language = detect(text)
+    except LangDetectException:
+        logger.warning(f'Failed to detect language for {text[:128]=}')
+        language = None
     section_digest = generate_section_digest(node)
     digest_hash = compute_digest_hash(section_digest)
     result = ProcessResultNode(

{content_extraction-0.4.2 → content_extraction-0.4.4}/src/content_extraction.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content_extraction
-Version: 0.4.2
+Version: 0.4.4
 Summary: Project dedicated to content extraction from unstructured files that contain some useful information.
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown