content-extraction 0.3.1.tar.gz → 0.4.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {content_extraction-0.3.1 → content_extraction-0.4.0}/PKG-INFO +3 -1
- {content_extraction-0.3.1 → content_extraction-0.4.0}/pyproject.toml +3 -3
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/file_handlers.py +18 -2
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/split_and_create_digest.py +5 -2
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction.egg-info/PKG-INFO +3 -1
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction.egg-info/requires.txt +2 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/README.md +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/setup.cfg +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/__init__.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/common_std_io.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/do_ocr.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/dspy_modules.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/extract_from_pptx.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/fix_ocr.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/logging_config.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/parse_html.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/process.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/process_document.sh +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/semantic_chunk_html.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction.egg-info/SOURCES.txt +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction.egg-info/dependency_links.txt +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction.egg-info/top_level.txt +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/tests/test_section_parser.py +0 -0
- {content_extraction-0.3.1 → content_extraction-0.4.0}/tests/test_semantic_chunk_html.py +0 -0
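In summary, 0.4.0 bumps the package version, adds `langdetect` and promotes `dspy` to runtime dependencies (while `mypy` is dropped from the dev group), has `file_handlers.py` write `parsed_sections.json` plus one JSON file per chunk under `chunks/`, and has `split_and_create_digest.py` attach a detected `language` to every digest node. The reconstructed per-file diffs follow.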
{content_extraction-0.3.1 → content_extraction-0.4.0}/PKG-INFO
RENAMED

```diff
@@ -1,10 +1,12 @@
 Metadata-Version: 2.4
 Name: content_extraction
-Version: 0.3.1
+Version: 0.4.0
 Summary: Project dedicated to content extraction from unstructured files that contain some useful information.
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 Requires-Dist: beautifulsoup4>=4.13.4
+Requires-Dist: dspy>=2.6.27
+Requires-Dist: langdetect>=1.0.9
 Requires-Dist: lxml>=6.0.0
 Requires-Dist: python-pptx>=1.0.2
 Requires-Dist: requests>=2.32.4
```
{content_extraction-0.3.1 → content_extraction-0.4.0}/pyproject.toml
RENAMED

```diff
@@ -10,12 +10,14 @@ where = ["src"]
 
 [project]
 name = "content_extraction"
-version = "0.3.1"
+version = "0.4.0"
 description = "Project dedicated to content extraction from unstructured files that contain some useful information."
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
     "beautifulsoup4>=4.13.4",
+    "dspy>=2.6.27",
+    "langdetect>=1.0.9",
     "lxml>=6.0.0",
     "python-pptx>=1.0.2",
     "requests>=2.32.4",
@@ -23,9 +25,7 @@ dependencies = [
 
 [dependency-groups]
 dev = [
-    "dspy>=2.6.27",
     "jupyterlab>=4.4.5",
-    "mypy>=1.17.0",
     "pre-commit>=4.2.0",
     "pyright>=1.1.403",
     "pytest>=8.4.1",
```
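A plausible reading of the dependency changes (an inference, not stated anywhere in the release): `dspy` moves out of the dev group because the package already ships `dspy_modules.py` and so needed `dspy` at runtime all along, while `langdetect` enters the runtime dependencies to support the new language detection in `split_and_create_digest.py`. `mypy` simply leaves the dev group.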
{content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/file_handlers.py
RENAMED

```diff
@@ -4,6 +4,7 @@ import subprocess
 import tempfile
 import mimetypes
 import logging
+from pathlib import Path
 from urllib.parse import urlparse
 
 import requests
@@ -12,6 +13,7 @@ from content_extraction.extract_from_pptx import extract_content as extract_pptx
 from content_extraction.semantic_chunk_html import HTMLSectionParser
 from content_extraction.common_std_io import write_stream_of_obj
 from content_extraction.split_and_create_digest import process_node
+import json
 
 
 logger = logging.getLogger(__name__)
@@ -236,6 +238,7 @@ def process_file(input_path: str, output_dir: str, force_ext: str = '') -> str:
     Main entry point for processing a file or URL.
     It identifies the file type, runs the appropriate handler, and returns the path to the final processed HTML file.
     """
+    output_dir_path = Path(output_dir)
     os.makedirs(output_dir, exist_ok=True)
     logger.info(f'[Processing File] Retrieving correct parser for "{input_path}"')
     handler = get_handler(input_path, force_ext)
@@ -265,16 +268,29 @@ def process_file(input_path: str, output_dir: str, force_ext: str = '') -> str:
     logger.info('[Processing File] Parsing HTML into sections.')
     parser = HTMLSectionParser()
     parsed_sections = parser.parse_sections(html_content)
+    parsed_sections_output_file = output_dir_path / 'parsed_sections.json'
+    with open(parsed_sections_output_file) as f:
+        f.write(json.dumps(parsed_sections))
 
     logger.info('[Processing File] Splitting parsed sections and creating JSON digest.')
-    jsonl_output_path = os.path.join(output_dir, 'sections.jsonl')
 
     all_nodes = []
     if parsed_sections:
         for section in parsed_sections:
            all_nodes.extend(process_node(section, parent_digest_hash=None))
 
-
+    jsonl_output_path = output_dir_path / 'sections.jsonl'
+    write_stream_of_obj(all_nodes, str(jsonl_output_path))
     logger.info(f'[Processing File] Successfully created JSON digest at {jsonl_output_path}')
 
+    logger.info('[Processing File] Starting to save individual chunks.')
+    chunks_path = output_dir_path / 'chunks'
+    os.makedirs(chunks_path, exist_ok=True)
+
+    for node in all_nodes:
+        chunk_path = chunks_path / f'{node["digest_hash"]}.json'
+        with open(chunk_path, 'w') as f:
+            json.dump(node, f)
+
+    logger.info('[Processing File] Successfully saved individual chunks.')
     return final_html_path
```
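One thing to flag in the hunk above: `parsed_sections.json` is opened without a mode argument, so Python's `open()` defaults to read-only; `f.write(...)` would raise `io.UnsupportedOperation`, and `open()` itself raises `FileNotFoundError` when the file does not yet exist. A minimal corrected sketch of that step (the `dump_parsed_sections` helper name is ours, for illustration only; the released code inlines this logic in `process_file`):

```python
import json
from pathlib import Path


def dump_parsed_sections(parsed_sections: list[dict], output_dir: str) -> Path:
    """Serialize parsed sections to <output_dir>/parsed_sections.json."""
    out_file = Path(output_dir) / 'parsed_sections.json'
    # Mode 'w' is required: open() defaults to 'r' (read-only),
    # under which f.write() raises io.UnsupportedOperation.
    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(parsed_sections, f)
    return out_file
```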
{content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction/split_and_create_digest.py
RENAMED

```diff
@@ -1,4 +1,4 @@
-
+from langdetect import detect
 import sys
 import argparse
 import hashlib
@@ -88,6 +88,8 @@ def process_node(node: dict, parent_digest_hash: str | None = None) -> list[dict
     """
     Recursively process a node and its subsections, returning a flat list of nodes.
     """
+    text = node.get('text', '')
+    language = detect(text)
     section_digest = generate_section_digest(node)
     digest_hash = compute_digest_hash(section_digest)
     result = ProcessResultNode(
@@ -95,8 +97,9 @@
             'digest_hash': digest_hash,
             'parent_digest_hash': parent_digest_hash,
             'title': node.get('title'),
-            'text': node.get('text', ''),
+            'text': text,
             'section_digest': section_digest,
+            'language': language,
         }
     )
     result = asdict(result)
```
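A caveat on the new `detect(text)` call: `langdetect` raises `LangDetectException` for empty strings or text with no detectable features (e.g. only digits or punctuation), and its output is non-deterministic across runs unless a seed is fixed. A defensive sketch, assuming callers would rather get a fallback code than an exception (the `safe_detect` name is hypothetical):

```python
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 0  # fix the seed so detection is deterministic


def safe_detect(text: str, fallback: str = 'unknown') -> str:
    """Return an ISO 639-1 language code, or `fallback` if detection fails."""
    try:
        return detect(text)
    except LangDetectException:
        # Raised for empty input or text without linguistic features.
        return fallback
```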
{content_extraction-0.3.1 → content_extraction-0.4.0}/src/content_extraction.egg-info/PKG-INFO
RENAMED

```diff
@@ -1,10 +1,12 @@
 Metadata-Version: 2.4
 Name: content_extraction
-Version: 0.3.1
+Version: 0.4.0
 Summary: Project dedicated to content extraction from unstructured files that contain some useful information.
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 Requires-Dist: beautifulsoup4>=4.13.4
+Requires-Dist: dspy>=2.6.27
+Requires-Dist: langdetect>=1.0.9
 Requires-Dist: lxml>=6.0.0
 Requires-Dist: python-pptx>=1.0.2
 Requires-Dist: requests>=2.32.4
```
All remaining files listed above with +0 -0 (README.md, setup.cfg, the unchanged source modules, the egg-info metadata, and the tests) carry over without changes.