content-extraction 0.3.0-py3-none-any.whl → 0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
content_extraction/file_handlers.py:

```diff
@@ -4,6 +4,7 @@ import subprocess
 import tempfile
 import mimetypes
 import logging
+from pathlib import Path
 from urllib.parse import urlparse
 
 import requests
@@ -12,6 +13,7 @@ from content_extraction.extract_from_pptx import extract_content as extract_pptx
 from content_extraction.semantic_chunk_html import HTMLSectionParser
 from content_extraction.common_std_io import write_stream_of_obj
 from content_extraction.split_and_create_digest import process_node
+import json
 
 
 logger = logging.getLogger(__name__)
@@ -236,6 +238,7 @@ def process_file(input_path: str, output_dir: str, force_ext: str = '') -> str:
     Main entry point for processing a file or URL.
     It identifies the file type, runs the appropriate handler, and returns the path to the final processed HTML file.
     """
+    output_dir_path = Path(output_dir)
     os.makedirs(output_dir, exist_ok=True)
     logger.info(f'[Processing File] Retrieving correct parser for "{input_path}"')
     handler = get_handler(input_path, force_ext)
@@ -265,16 +268,29 @@ def process_file(input_path: str, output_dir: str, force_ext: str = '') -> str:
     logger.info('[Processing File] Parsing HTML into sections.')
     parser = HTMLSectionParser()
     parsed_sections = parser.parse_sections(html_content)
+    parsed_sections_output_file = output_dir_path / 'parsed_sections.json'
+    with open(parsed_sections_output_file, 'w') as f:
+        f.write(json.dumps(parsed_sections))
 
     logger.info('[Processing File] Splitting parsed sections and creating JSON digest.')
-    jsonl_output_path = os.path.join(output_dir, 'sections.jsonl')
 
     all_nodes = []
     if parsed_sections:
         for section in parsed_sections:
             all_nodes.extend(process_node(section, parent_digest_hash=None))
 
-    write_stream_of_obj(all_nodes, jsonl_output_path)
+    jsonl_output_path = output_dir_path / 'sections.jsonl'
+    write_stream_of_obj(all_nodes, str(jsonl_output_path))
     logger.info(f'[Processing File] Successfully created JSON digest at {jsonl_output_path}')
 
+    logger.info('[Processing File] Starting to save individual chunks.')
+    chunks_path = output_dir_path / 'chunks'
+    os.makedirs(chunks_path, exist_ok=True)
+
+    for node in all_nodes:
+        chunk_path = chunks_path / f'{node["digest_hash"]}.json'
+        with open(chunk_path, 'w') as f:
+            json.dump(node, f)
+
+    logger.info('[Processing File] Successfully saved individual chunks.')
     return final_html_path
```
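Taken together, the `process_file` changes mean 0.4.0 writes three artifacts instead of one: the raw section tree (`parsed_sections.json`), the flat digest stream (`sections.jsonl`), and one JSON file per node under `chunks/`, keyed by digest hash. A minimal sketch of consuming that layout; the module path for `process_file` is inferred from the RECORD changes below (only `file_handlers.py` changed size), and the file names are hypothetical:

```python
import json
from pathlib import Path

# Assumption: process_file is the function patched above, living in
# content_extraction.file_handlers (inferred from the diff, not documented).
from content_extraction.file_handlers import process_file

out_dir = 'out'                                   # hypothetical output directory
final_html = process_file('report.pdf', out_dir)  # hypothetical input document

# New in 0.4.0: one JSON file per processed node, named by its digest hash.
for chunk_file in sorted(Path(out_dir, 'chunks').glob('*.json')):
    node = json.loads(chunk_file.read_text())
    print(node['digest_hash'], node.get('title'))
```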
content_extraction/split_and_create_digest.py:

```diff
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+from langdetect import detect
 import sys
 import argparse
 import hashlib
@@ -88,6 +88,8 @@ def process_node(node: dict, parent_digest_hash: str | None = None) -> list[dict
     """
     Recursively process a node and its subsections, returning a flat list of nodes.
     """
+    text = node.get('text', '')
+    language = detect(text)
     section_digest = generate_section_digest(node)
     digest_hash = compute_digest_hash(section_digest)
     result = ProcessResultNode(
@@ -95,8 +97,9 @@ def process_node(node: dict, parent_digest_hash: str | None = None) -> list[dict
             'digest_hash': digest_hash,
             'parent_digest_hash': parent_digest_hash,
             'title': node.get('title'),
-            'text': node.get('text'),
+            'text': text,
             'section_digest': section_digest,
+            'language': language,
         }
     )
     result = asdict(result)
```
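One behavioral note on the new `langdetect` call: `detect()` is probabilistic and raises `LangDetectException` on empty or feature-less input, and `process_node` now defaults missing text to `''` before detecting. A small sketch of the library behavior the new `language` field relies on; the `safe_detect` wrapper is an illustration, not part of the released code:

```python
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 0  # detection is probabilistic; seeding makes results repeatable

def safe_detect(text: str) -> str | None:
    """Hypothetical guard; detect() raises on empty or undecidable input."""
    try:
        return detect(text)  # returns an ISO 639-1 code such as 'en'
    except LangDetectException:
        return None

print(safe_detect('Content extraction from unstructured files.'))  # 'en'
print(safe_detect(''))                                             # None
```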
content_extraction-0.4.0.dist-info/METADATA:

```diff
@@ -1,12 +1,15 @@
 Metadata-Version: 2.4
 Name: content_extraction
-Version: 0.3.0
+Version: 0.4.0
 Summary: Project dedicated to content extraction from unstructured files that contain some useful information.
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 Requires-Dist: beautifulsoup4>=4.13.4
+Requires-Dist: dspy>=2.6.27
+Requires-Dist: langdetect>=1.0.9
 Requires-Dist: lxml>=6.0.0
 Requires-Dist: python-pptx>=1.0.2
+Requires-Dist: requests>=2.32.4
 
 # HTML Content Extraction Tool
 
```
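The metadata now declares three additional runtime dependencies: `langdetect` backs the new `from langdetect import detect` import, while `requests` (already imported in 0.3.0) and `dspy` (used by the unchanged `dspy_modules.py`) appear to be previously undeclared dependencies made explicit. A quick way to confirm an installed environment has them, using only the standard library; the version comparison is left as a visual check:

```python
from importlib.metadata import PackageNotFoundError, version

# The pins mirror the Requires-Dist lines added in 0.4.0.
for dist, minimum in [('dspy', '2.6.27'), ('langdetect', '1.0.9'), ('requests', '2.32.4')]:
    try:
        print(f'{dist}: installed {version(dist)}, required >= {minimum}')
    except PackageNotFoundError:
        print(f'{dist}: not installed (required >= {minimum})')
```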
content_extraction-0.4.0.dist-info/RECORD:

```diff
@@ -3,15 +3,15 @@ content_extraction/common_std_io.py,sha256=mSRaiI4OrnttEQ8Y92-LsJnAHEI3xLKnJvmXD
 content_extraction/do_ocr.py,sha256=lrqwPYQlPuUHabirH_RzKbzHgYUPPpNeHDe_u4h9LEY,6886
 content_extraction/dspy_modules.py,sha256=0aAokJQNzczfowoUNK3BPMi_U18eXM9thHvciWaE5b0,732
 content_extraction/extract_from_pptx.py,sha256=IWd81sn7ZsyaQZdXP5Cgbk7GspcDYEjMnBkti-pTHQY,6572
-content_extraction/file_handlers.py,sha256=fEAm0Iie5qc5ex4IhOQJGWkjZK12dfzuCabkmHF1GVM,11132
+content_extraction/file_handlers.py,sha256=ppCi2A05Qns1I89jLu6gJyV2UidcY03DGjsZ8TkGXK8,11777
 content_extraction/fix_ocr.py,sha256=2xJ4c3VsGSy1l-qAukvhaV8QOp6yu5BY99Gb0DwamWQ,8009
 content_extraction/logging_config.py,sha256=GN1wuJJEspQ3z-FZIg134obsHweuiicZfz2an13a9_I,296
 content_extraction/parse_html.py,sha256=mOrZKXX59YcdWWhmbnoTnfXpwrg0znk38x0DMJIVes8,3137
 content_extraction/process.py,sha256=iLcmSjWhEg_DbgnftnVIfybIeLCuTEI57gasot0MtDk,1809
 content_extraction/process_document.sh,sha256=QbQOrV7isiEyxin1PBNGYmCbfVQ_eW-JgsbuQV4VB2o,1106
 content_extraction/semantic_chunk_html.py,sha256=iJPspKkrt95lL46JpC_9fgT8GfV8cz04TWEnU99rbBw,5786
-content_extraction/split_and_create_digest.py,sha256=bKZL9Axc74zLH_VrlNjd46ZiVTQQrAY5iNJCotO-8v8,4253
-content_extraction-0.3.0.dist-info/METADATA,sha256=j0aBHEpJ1JTarADNp-2anMD8BKpuZV5Gj45sZ8h9u4I,6201
-content_extraction-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-content_extraction-0.3.0.dist-info/top_level.txt,sha256=a0I0EwSzsyd3p_aAENozn9i4I3aBn12XtrbqIvfzZec,19
-content_extraction-0.3.0.dist-info/RECORD,,
+content_extraction/split_and_create_digest.py,sha256=vW4lyeTlRzZcqJS15g8Xqq5IZB06unrUBnQV7RrFDmA,4342
+content_extraction-0.4.0.dist-info/METADATA,sha256=e2WMoLJBJrnKv77DBw2Q-4pwR-9NpzksRZe3mntcy4A,6294
+content_extraction-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+content_extraction-0.4.0.dist-info/top_level.txt,sha256=a0I0EwSzsyd3p_aAENozn9i4I3aBn12XtrbqIvfzZec,19
+content_extraction-0.4.0.dist-info/RECORD,,
```