content-extraction 0.3.1.tar.gz → 0.4.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {content_extraction-0.3.1 → content_extraction-0.4.0}/PKG-INFO +3 -1
- {content_extraction-0.3.1 → content_extraction-0.4.0}/pyproject.toml +3 -3
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/file_handlers.py +18 -2
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/split_and_create_digest.py +5 -2
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction.egg-info/PKG-INFO +3 -1
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction.egg-info/requires.txt +2 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/README.md +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/setup.cfg +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/__init__.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/common_std_io.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/do_ocr.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/dspy_modules.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/extract_from_pptx.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/fix_ocr.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/logging_config.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/parse_html.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/process.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/process_document.sh +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/semantic_chunk_html.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction.egg-info/SOURCES.txt +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction.egg-info/dependency_links.txt +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction.egg-info/top_level.txt +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/tests/test_section_parser.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/tests/test_semantic_chunk_html.py +0 -0
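In summary, 0.4.0 bumps the package version, adds `langdetect` and promotes `dspy` to runtime dependencies (while `mypy` is dropped from the dev group), has `file_handlers.py` write `parsed_sections.json` plus one JSON file per chunk under `chunks/`, and has `split_and_create_digest.py` attach a detected `language` to every digest node. The reconstructed per-file diffs follow.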
{content_extraction-0.3.1 → content_extraction-0.4.0}/PKG-INFO
RENAMED

```diff
@@ -1,10 +1,12 @@
 Metadata-Version: 2.4
 Name: content_extraction
-Version: 0.3.1
+Version: 0.4.0
 Summary: Project dedicated to content extraction from unstructured files that contain some useful information.
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 Requires-Dist: beautifulsoup4>=4.13.4
+Requires-Dist: dspy>=2.6.27
+Requires-Dist: langdetect>=1.0.9
 Requires-Dist: lxml>=6.0.0
 Requires-Dist: python-pptx>=1.0.2
 Requires-Dist: requests>=2.32.4
```
{content_extraction-0.3.1 → content_extraction-0.4.0}/pyproject.toml
RENAMED

```diff
@@ -10,12 +10,14 @@ where = ["src"]
 
 [project]
 name = "content_extraction"
-version = "0.3.1"
+version = "0.4.0"
 description = "Project dedicated to content extraction from unstructured files that contain some useful information."
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
     "beautifulsoup4>=4.13.4",
+    "dspy>=2.6.27",
+    "langdetect>=1.0.9",
     "lxml>=6.0.0",
     "python-pptx>=1.0.2",
     "requests>=2.32.4",
@@ -23,9 +25,7 @@ dependencies = [
 
 [dependency-groups]
 dev = [
-    "dspy>=2.6.27",
     "jupyterlab>=4.4.5",
-    "mypy>=1.17.0",
     "pre-commit>=4.2.0",
     "pyright>=1.1.403",
     "pytest>=8.4.1",
```
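A plausible reading of the dependency changes (an inference, not stated anywhere in the release): `dspy` moves out of the dev group because the package already ships `dspy_modules.py` and so needed `dspy` at runtime all along, while `langdetect` enters the runtime dependencies to support the new language detection in `split_and_create_digest.py`. `mypy` simply leaves the dev group.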
{content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/file_handlers.py
RENAMED

```diff
@@ -4,6 +4,7 @@ import subprocess
 import tempfile
 import mimetypes
 import logging
+from pathlib import Path
 from urllib.parse import urlparse
 
 import requests
@@ -12,6 +13,7 @@ from content_extraction.extract_from_pptx import extract_content as extract_pptx
 from content_extraction.semantic_chunk_html import HTMLSectionParser
 from content_extraction.common_std_io import write_stream_of_obj
 from content_extraction.split_and_create_digest import process_node
+import json
 
 
 logger = logging.getLogger(__name__)
@@ -236,6 +238,7 @@ def process_file(input_path: str, output_dir: str, force_ext: str = '') -> str:
     Main entry point for processing a file or URL.
     It identifies the file type, runs the appropriate handler, and returns the path to the final processed HTML file.
     """
+    output_dir_path = Path(output_dir)
     os.makedirs(output_dir, exist_ok=True)
     logger.info(f'[Processing File] Retrieving correct parser for "{input_path}"')
     handler = get_handler(input_path, force_ext)
@@ -265,16 +268,29 @@ def process_file(input_path: str, output_dir: str, force_ext: str = '') -> str:
     logger.info('[Processing File] Parsing HTML into sections.')
     parser = HTMLSectionParser()
     parsed_sections = parser.parse_sections(html_content)
+    parsed_sections_output_file = output_dir_path / 'parsed_sections.json'
+    with open(parsed_sections_output_file) as f:
+        f.write(json.dumps(parsed_sections))
 
     logger.info('[Processing File] Splitting parsed sections and creating JSON digest.')
-    jsonl_output_path = os.path.join(output_dir, 'sections.jsonl')
 
     all_nodes = []
     if parsed_sections:
         for section in parsed_sections:
            all_nodes.extend(process_node(section, parent_digest_hash=None))
 
-
+    jsonl_output_path = output_dir_path / 'sections.jsonl'
+    write_stream_of_obj(all_nodes, str(jsonl_output_path))
     logger.info(f'[Processing File] Successfully created JSON digest at {jsonl_output_path}')
 
+    logger.info('[Processing File] Starting to save individual chunks.')
+    chunks_path = output_dir_path / 'chunks'
+    os.makedirs(chunks_path, exist_ok=True)
+
+    for node in all_nodes:
+        chunk_path = chunks_path / f'{node["digest_hash"]}.json'
+        with open(chunk_path, 'w') as f:
+            json.dump(node, f)
+
+    logger.info('[Processing File] Successfully saved individual chunks.')
     return final_html_path
```
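One thing to flag in the hunk above: `parsed_sections.json` is opened without a mode argument, so Python's `open()` defaults to read-only; `f.write(...)` would raise `io.UnsupportedOperation`, and `open()` itself raises `FileNotFoundError` when the file does not yet exist. A minimal corrected sketch of that step (the `dump_parsed_sections` helper name is ours, for illustration only; the released code inlines this logic in `process_file`):

```python
import json
from pathlib import Path


def dump_parsed_sections(parsed_sections: list[dict], output_dir: str) -> Path:
    """Serialize parsed sections to <output_dir>/parsed_sections.json."""
    out_file = Path(output_dir) / 'parsed_sections.json'
    # Mode 'w' is required: open() defaults to 'r' (read-only),
    # under which f.write() raises io.UnsupportedOperation.
    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(parsed_sections, f)
    return out_file
```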
{content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/split_and_create_digest.py
RENAMED

```diff
@@ -1,4 +1,4 @@
-
+from langdetect import detect
 import sys
 import argparse
 import hashlib
@@ -88,6 +88,8 @@ def process_node(node: dict, parent_digest_hash: str | None = None) -> list[dict
     """
     Recursively process a node and its subsections, returning a flat list of nodes.
     """
+    text = node.get('text', '')
+    language = detect(text)
     section_digest = generate_section_digest(node)
     digest_hash = compute_digest_hash(section_digest)
     result = ProcessResultNode(
@@ -95,8 +97,9 @@
             'digest_hash': digest_hash,
             'parent_digest_hash': parent_digest_hash,
             'title': node.get('title'),
-            'text': node.get('text', ''),
+            'text': text,
             'section_digest': section_digest,
+            'language': language,
         }
     )
     result = asdict(result)
```
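A caveat on the new `detect(text)` call: `langdetect` raises `LangDetectException` for empty strings or text with no detectable features (e.g. only digits or punctuation), and its output is non-deterministic across runs unless a seed is fixed. A defensive sketch, assuming callers would rather get a fallback code than an exception (the `safe_detect` name is hypothetical):

```python
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 0  # fix the seed so detection is deterministic


def safe_detect(text: str, fallback: str = 'unknown') -> str:
    """Return an ISO 639-1 language code, or `fallback` if detection fails."""
    try:
        return detect(text)
    except LangDetectException:
        # Raised for empty input or text without linguistic features.
        return fallback
```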
{content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction.egg-info/PKG-INFO
RENAMED

```diff
@@ -1,10 +1,12 @@
 Metadata-Version: 2.4
 Name: content_extraction
-Version: 0.3.1
+Version: 0.4.0
 Summary: Project dedicated to content extraction from unstructured files that contain some useful information.
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 Requires-Dist: beautifulsoup4>=4.13.4
+Requires-Dist: dspy>=2.6.27
+Requires-Dist: langdetect>=1.0.9
 Requires-Dist: lxml>=6.0.0
 Requires-Dist: python-pptx>=1.0.2
 Requires-Dist: requests>=2.32.4
```
All remaining files listed above with +0 -0 (README.md, setup.cfg, the unchanged source modules, the egg-info metadata, and the tests) carry over without changes.