content-extraction 0.3.0-py3-none-any.whl → 0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
content_extraction/file_handlers.py:

```diff
@@ -4,6 +4,7 @@ import subprocess
 import tempfile
 import mimetypes
 import logging
+from pathlib import Path
 from urllib.parse import urlparse
 
 import requests
@@ -12,6 +13,7 @@ from content_extraction.extract_from_pptx import extract_content as extract_pptx
 from content_extraction.semantic_chunk_html import HTMLSectionParser
 from content_extraction.common_std_io import write_stream_of_obj
 from content_extraction.split_and_create_digest import process_node
+import json
 
 
 logger = logging.getLogger(__name__)
@@ -236,6 +238,7 @@ def process_file(input_path: str, output_dir: str, force_ext: str = '') -> str:
     Main entry point for processing a file or URL.
     It identifies the file type, runs the appropriate handler, and returns the path to the final processed HTML file.
     """
+    output_dir_path = Path(output_dir)
     os.makedirs(output_dir, exist_ok=True)
     logger.info(f'[Processing File] Retrieving correct parser for "{input_path}"')
     handler = get_handler(input_path, force_ext)
@@ -265,16 +268,29 @@ def process_file(input_path: str, output_dir: str, force_ext: str = '') -> str:
     logger.info('[Processing File] Parsing HTML into sections.')
     parser = HTMLSectionParser()
     parsed_sections = parser.parse_sections(html_content)
+    parsed_sections_output_file = output_dir_path / 'parsed_sections.json'
+    with open(parsed_sections_output_file, 'w') as f:
+        f.write(json.dumps(parsed_sections))
 
     logger.info('[Processing File] Splitting parsed sections and creating JSON digest.')
-    jsonl_output_path = os.path.join(output_dir, 'sections.jsonl')
 
     all_nodes = []
     if parsed_sections:
         for section in parsed_sections:
             all_nodes.extend(process_node(section, parent_digest_hash=None))
 
-    write_stream_of_obj(all_nodes, jsonl_output_path)
+    jsonl_output_path = output_dir_path / 'sections.jsonl'
+    write_stream_of_obj(all_nodes, str(jsonl_output_path))
     logger.info(f'[Processing File] Successfully created JSON digest at {jsonl_output_path}')
 
+    logger.info('[Processing File] Starting to save individual chunks.')
+    chunks_path = output_dir_path / 'chunks'
+    os.makedirs(chunks_path, exist_ok=True)
+
+    for node in all_nodes:
+        chunk_path = chunks_path / f'{node["digest_hash"]}.json'
+        with open(chunk_path, 'w') as f:
+            json.dump(node, f)
+
+    logger.info('[Processing File] Successfully saved individual chunks.')
     return final_html_path
```
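Taken together, the `process_file` changes mean 0.4.0 writes three artifacts instead of one: the raw section tree (`parsed_sections.json`), the flat digest stream (`sections.jsonl`), and one JSON file per node under `chunks/`, keyed by digest hash. A minimal sketch of consuming that layout; the module path for `process_file` is inferred from the RECORD changes below (only `file_handlers.py` changed size), and the file names are hypothetical:

```python
import json
from pathlib import Path

# Assumption: process_file is the function patched above, living in
# content_extraction.file_handlers (inferred from the diff, not documented).
from content_extraction.file_handlers import process_file

out_dir = 'out'                                   # hypothetical output directory
final_html = process_file('report.pdf', out_dir)  # hypothetical input document

# New in 0.4.0: one JSON file per processed node, named by its digest hash.
for chunk_file in sorted(Path(out_dir, 'chunks').glob('*.json')):
    node = json.loads(chunk_file.read_text())
    print(node['digest_hash'], node.get('title'))
```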
content_extraction/split_and_create_digest.py:

```diff
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+from langdetect import detect
 import sys
 import argparse
 import hashlib
@@ -88,6 +88,8 @@ def process_node(node: dict, parent_digest_hash: str | None = None) -> list[dict
     """
     Recursively process a node and its subsections, returning a flat list of nodes.
     """
+    text = node.get('text', '')
+    language = detect(text)
     section_digest = generate_section_digest(node)
     digest_hash = compute_digest_hash(section_digest)
     result = ProcessResultNode(
@@ -95,8 +97,9 @@ def process_node(node: dict, parent_digest_hash: str | None = None) -> list[dict
             'digest_hash': digest_hash,
             'parent_digest_hash': parent_digest_hash,
             'title': node.get('title'),
-            'text': node.get('text'),
+            'text': text,
             'section_digest': section_digest,
+            'language': language,
         }
     )
     result = asdict(result)
```
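One behavioral note on the new `langdetect` call: `detect()` is probabilistic and raises `LangDetectException` on empty or feature-less input, and `process_node` now defaults missing text to `''` before detecting. A small sketch of the library behavior the new `language` field relies on; the `safe_detect` wrapper is an illustration, not part of the released code:

```python
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 0  # detection is probabilistic; seeding makes results repeatable

def safe_detect(text: str) -> str | None:
    """Hypothetical guard; detect() raises on empty or undecidable input."""
    try:
        return detect(text)  # returns an ISO 639-1 code such as 'en'
    except LangDetectException:
        return None

print(safe_detect('Content extraction from unstructured files.'))  # 'en'
print(safe_detect(''))                                             # None
```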
content_extraction-0.4.0.dist-info/METADATA:

```diff
@@ -1,12 +1,15 @@
 Metadata-Version: 2.4
 Name: content_extraction
-Version: 0.3.0
+Version: 0.4.0
 Summary: Project dedicated to content extraction from unstructured files that contain some useful information.
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 Requires-Dist: beautifulsoup4>=4.13.4
+Requires-Dist: dspy>=2.6.27
+Requires-Dist: langdetect>=1.0.9
 Requires-Dist: lxml>=6.0.0
 Requires-Dist: python-pptx>=1.0.2
+Requires-Dist: requests>=2.32.4
 
 # HTML Content Extraction Tool
 
```
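The metadata now declares three additional runtime dependencies: `langdetect` backs the new `from langdetect import detect` import, while `requests` (already imported in 0.3.0) and `dspy` (used by the unchanged `dspy_modules.py`) appear to be previously undeclared dependencies made explicit. A quick way to confirm an installed environment has them, using only the standard library; the version comparison is left as a visual check:

```python
from importlib.metadata import PackageNotFoundError, version

# The pins mirror the Requires-Dist lines added in 0.4.0.
for dist, minimum in [('dspy', '2.6.27'), ('langdetect', '1.0.9'), ('requests', '2.32.4')]:
    try:
        print(f'{dist}: installed {version(dist)}, required >= {minimum}')
    except PackageNotFoundError:
        print(f'{dist}: not installed (required >= {minimum})')
```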
content_extraction-0.4.0.dist-info/RECORD:

```diff
@@ -3,15 +3,15 @@ content_extraction/common_std_io.py,sha256=mSRaiI4OrnttEQ8Y92-LsJnAHEI3xLKnJvmXD
 content_extraction/do_ocr.py,sha256=lrqwPYQlPuUHabirH_RzKbzHgYUPPpNeHDe_u4h9LEY,6886
 content_extraction/dspy_modules.py,sha256=0aAokJQNzczfowoUNK3BPMi_U18eXM9thHvciWaE5b0,732
 content_extraction/extract_from_pptx.py,sha256=IWd81sn7ZsyaQZdXP5Cgbk7GspcDYEjMnBkti-pTHQY,6572
-content_extraction/file_handlers.py,sha256=fEAm0Iie5qc5ex4IhOQJGWkjZK12dfzuCabkmHF1GVM,11132
+content_extraction/file_handlers.py,sha256=ppCi2A05Qns1I89jLu6gJyV2UidcY03DGjsZ8TkGXK8,11777
 content_extraction/fix_ocr.py,sha256=2xJ4c3VsGSy1l-qAukvhaV8QOp6yu5BY99Gb0DwamWQ,8009
 content_extraction/logging_config.py,sha256=GN1wuJJEspQ3z-FZIg134obsHweuiicZfz2an13a9_I,296
 content_extraction/parse_html.py,sha256=mOrZKXX59YcdWWhmbnoTnfXpwrg0znk38x0DMJIVes8,3137
 content_extraction/process.py,sha256=iLcmSjWhEg_DbgnftnVIfybIeLCuTEI57gasot0MtDk,1809
 content_extraction/process_document.sh,sha256=QbQOrV7isiEyxin1PBNGYmCbfVQ_eW-JgsbuQV4VB2o,1106
 content_extraction/semantic_chunk_html.py,sha256=iJPspKkrt95lL46JpC_9fgT8GfV8cz04TWEnU99rbBw,5786
-content_extraction/split_and_create_digest.py,sha256=bKZL9Axc74zLH_VrlNjd46ZiVTQQrAY5iNJCotO-8v8,4253
-content_extraction-0.3.0.dist-info/METADATA,sha256=j0aBHEpJ1JTarADNp-2anMD8BKpuZV5Gj45sZ8h9u4I,6201
-content_extraction-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-content_extraction-0.3.0.dist-info/top_level.txt,sha256=a0I0EwSzsyd3p_aAENozn9i4I3aBn12XtrbqIvfzZec,19
-content_extraction-0.3.0.dist-info/RECORD,,
+content_extraction/split_and_create_digest.py,sha256=vW4lyeTlRzZcqJS15g8Xqq5IZB06unrUBnQV7RrFDmA,4342
+content_extraction-0.4.0.dist-info/METADATA,sha256=e2WMoLJBJrnKv77DBw2Q-4pwR-9NpzksRZe3mntcy4A,6294
+content_extraction-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+content_extraction-0.4.0.dist-info/top_level.txt,sha256=a0I0EwSzsyd3p_aAENozn9i4I3aBn12XtrbqIvfzZec,19
+content_extraction-0.4.0.dist-info/RECORD,,
```