polytext 0.2.4__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {polytext-0.2.4 → polytext-0.2.6}/PKG-INFO +2 -1
- {polytext-0.2.4 → polytext-0.2.6}/polytext/__init__.py +27 -2
- polytext-0.2.6/polytext/converter/beautiful_text.py +209 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/converter/pdf.py +16 -3
- {polytext-0.2.4 → polytext-0.2.6}/polytext/loader/base.py +195 -13
- polytext-0.2.6/polytext/prompts/beautiful_text.py +43 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext.egg-info/PKG-INFO +2 -1
- {polytext-0.2.4 → polytext-0.2.6}/polytext.egg-info/SOURCES.txt +4 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext.egg-info/requires.txt +1 -0
- {polytext-0.2.4 → polytext-0.2.6}/setup.py +1 -1
- polytext-0.2.6/tests/test_base_loader_error_mapping.py +148 -0
- polytext-0.2.6/tests/test_beautiful_text_manual.py +68 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_get_audio_transcript_from_gcs.py +1 -1
- polytext-0.2.6/tests/test_pdf_conversion_error.py +43 -0
- polytext-0.2.4/tests/test_base_loader_error_mapping.py +0 -81
- {polytext-0.2.4 → polytext-0.2.6}/LICENSE +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/README.md +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/converter/__init__.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/converter/audio_to_text.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/converter/base.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/converter/document_ocr_to_text.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/converter/document_ocr_to_text_azure_oai.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/converter/gemini_quality_guards.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/converter/html_to_md.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/converter/md_to_text.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/converter/ocr_to_text.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/converter/ocr_to_text_azure_oai.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/converter/text_to_md.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/converter/video_to_audio.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/exceptions/__init__.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/exceptions/base.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/generator/__init__.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/generator/pdf.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/loader/__init__.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/loader/audio.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/loader/document.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/loader/document_ocr.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/loader/downloader/__init__.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/loader/downloader/downloader.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/loader/html.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/loader/markdown.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/loader/notebook.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/loader/ocr.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/loader/plain_text.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/loader/video.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/loader/xml_xbrl.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/loader/youtube.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/loader/youtube_llm.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/processor/__init__.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/processor/audio_chunker.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/processor/text_merger.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/processor/transcript_chunker.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/prompts/__init__.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/prompts/ocr.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/prompts/text_merging.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/prompts/text_to_md.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/prompts/transcription.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/utils/__init__.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext/utils/utils.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext.egg-info/dependency_links.txt +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext.egg-info/not-zip-safe +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/polytext.egg-info/top_level.txt +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/pyproject.toml +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/setup.cfg +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_audio_chunker.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_audio_comparison_helpers.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_audio_transcription_model_migration.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_compare_audio_models.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_compare_document_ocr_to_text_models.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_compare_ocr_to_text_models.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_compare_youtube_models.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_dowload_audio_from_youtube.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_dowload_audio_from_youtube_helpers.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_extracted_text_whitespace.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_gemini_quality_guards.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_get_customized_pdf_from_markdown.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_get_document_ocr.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_get_document_ocr_azure_oai.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_get_document_text.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_get_document_text_from_gcs.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_get_ocr_from_image.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_get_text_from_markdown.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_get_video_transcript_from_gcs.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_library.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_markdown_loader_gzip.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_markitdown_html.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_notebook_loader.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_ocr_fallbacks.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_ocr_image_descriptions.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_pain_text.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_python_version_metadata.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_split_audio_with_llm.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_xml_xbrl_loader.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_youtube_gemini_minimal_check.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_youtube_llm_fallbacks.py +0 -0
- {polytext-0.2.4 → polytext-0.2.6}/tests/test_youtube_transcript.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: polytext
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.6
|
|
4
4
|
Summary: Python utilities to simplify document files management
|
|
5
5
|
Home-page: https://github.com/docsity/polytext
|
|
6
6
|
Author: Matteo Senardi
|
|
@@ -21,6 +21,7 @@ Requires-Dist: PyMuPDF>=1.25.5
|
|
|
21
21
|
Requires-Dist: pycryptodome==3.23.0
|
|
22
22
|
Requires-Dist: weasyprint==65.1
|
|
23
23
|
Requires-Dist: markdown==3.8
|
|
24
|
+
Requires-Dist: markdown-to-json==2.1.2
|
|
24
25
|
Requires-Dist: python-docx==1.1.2
|
|
25
26
|
Requires-Dist: google-api-core>=2.24.2
|
|
26
27
|
Requires-Dist: google-cloud-storage<3.0.0,>=2.17
|
|
@@ -3,11 +3,36 @@ import os
|
|
|
3
3
|
import logging
|
|
4
4
|
import dotenv
|
|
5
5
|
|
|
6
|
+
from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError, LoaderError
|
|
7
|
+
|
|
6
8
|
logger = logging.getLogger(__name__)
|
|
7
9
|
|
|
8
10
|
# Load environment variables
|
|
9
11
|
dotenv.load_dotenv()
|
|
10
12
|
|
|
13
|
+
|
|
14
|
+
def _filter_expected_loader_errors(event, hint):
|
|
15
|
+
error = None
|
|
16
|
+
if hint:
|
|
17
|
+
exc_info = hint.get("exc_info")
|
|
18
|
+
if exc_info:
|
|
19
|
+
error = exc_info[1]
|
|
20
|
+
else:
|
|
21
|
+
error = hint.get("original_exception")
|
|
22
|
+
|
|
23
|
+
if isinstance(error, LoaderError) and error.code == "NO_TEXT_DETECTED":
|
|
24
|
+
return None
|
|
25
|
+
|
|
26
|
+
exception_values = (event or {}).get("exception", {}).get("values", [])
|
|
27
|
+
for exception_value in exception_values:
|
|
28
|
+
exception_type = exception_value.get("type") or ""
|
|
29
|
+
exception_message = exception_value.get("value")
|
|
30
|
+
if exception_type.endswith("LoaderError") and exception_message == "No text detected":
|
|
31
|
+
return None
|
|
32
|
+
|
|
33
|
+
return event
|
|
34
|
+
|
|
35
|
+
|
|
11
36
|
# Initialize Sentry if DSN is configured
|
|
12
37
|
sentry_dsn = os.getenv('SENTRY_DSN_POLYTEXT')
|
|
13
38
|
if sentry_dsn:
|
|
@@ -18,6 +43,7 @@ if sentry_dsn:
|
|
|
18
43
|
environment=os.getenv('ENV', 'prod'),
|
|
19
44
|
traces_sample_rate=1.0,
|
|
20
45
|
profiles_sample_rate=1.0,
|
|
46
|
+
before_send=_filter_expected_loader_errors,
|
|
21
47
|
)
|
|
22
48
|
logger.info("Sentry monitoring initialized")
|
|
23
49
|
except ImportError:
|
|
@@ -26,7 +52,6 @@ if sentry_dsn:
|
|
|
26
52
|
|
|
27
53
|
from .converter.pdf import convert_to_pdf, DocumentConverter
|
|
28
54
|
from .loader.document import DocumentLoader
|
|
29
|
-
from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError
|
|
30
55
|
from .generator.pdf import get_customized_pdf_from_markdown, PDFGenerator
|
|
31
56
|
|
|
32
57
|
__all__ = [
|
|
@@ -38,4 +63,4 @@ __all__ = [
|
|
|
38
63
|
'ConversionError',
|
|
39
64
|
'get_customized_pdf_from_markdown',
|
|
40
65
|
'PDFGenerator'
|
|
41
|
-
]
|
|
66
|
+
]
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
import time
|
|
4
|
+
from importlib import import_module
|
|
5
|
+
|
|
6
|
+
from google import genai
|
|
7
|
+
from google.genai import types
|
|
8
|
+
from google.api_core import exceptions as google_exceptions
|
|
9
|
+
from retry import retry
|
|
10
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
11
|
+
|
|
12
|
+
from polytext.processor.transcript_chunker import TranscriptChunker
|
|
13
|
+
from polytext.processor.text_merger import TextMerger
|
|
14
|
+
from polytext.prompts.beautiful_text import BEAUTIFUL_TEXT_PROMPT
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class BeautifulTextConverter:
|
|
20
|
+
def __init__(
|
|
21
|
+
self,
|
|
22
|
+
llm_api_key: str = None,
|
|
23
|
+
model: str = "gemini-3.1-flash-lite",
|
|
24
|
+
model_provider: str = "google",
|
|
25
|
+
max_llm_tokens: int = 8000,
|
|
26
|
+
prompt_overhead: int = 1800,
|
|
27
|
+
tokens_per_char: float = 0.25,
|
|
28
|
+
overlap_chars: int = 800,
|
|
29
|
+
) -> None:
|
|
30
|
+
self.llm_api_key = llm_api_key
|
|
31
|
+
self.model = model
|
|
32
|
+
self.model_provider = model_provider
|
|
33
|
+
self.max_llm_tokens = max_llm_tokens
|
|
34
|
+
self.prompt_overhead = prompt_overhead
|
|
35
|
+
self.tokens_per_char = tokens_per_char
|
|
36
|
+
self.overlap_chars = overlap_chars
|
|
37
|
+
|
|
38
|
+
def get_client(self):
|
|
39
|
+
return genai.Client(api_key=self.llm_api_key) if self.llm_api_key else genai.Client()
|
|
40
|
+
|
|
41
|
+
def chunk_raw_text(self, raw_text: str) -> list[dict]:
|
|
42
|
+
chunker = TranscriptChunker(
|
|
43
|
+
transcript=raw_text,
|
|
44
|
+
max_llm_tokens=self.max_llm_tokens,
|
|
45
|
+
prompt_overhead=self.prompt_overhead,
|
|
46
|
+
tokens_per_char=self.tokens_per_char,
|
|
47
|
+
overlap_chars=self.overlap_chars,
|
|
48
|
+
)
|
|
49
|
+
return chunker.chunk_transcript()
|
|
50
|
+
|
|
51
|
+
@retry(
|
|
52
|
+
(
|
|
53
|
+
google_exceptions.DeadlineExceeded,
|
|
54
|
+
google_exceptions.ResourceExhausted,
|
|
55
|
+
google_exceptions.ServiceUnavailable,
|
|
56
|
+
google_exceptions.InternalServerError,
|
|
57
|
+
),
|
|
58
|
+
tries=5,
|
|
59
|
+
delay=2,
|
|
60
|
+
backoff=2,
|
|
61
|
+
logger=logger,
|
|
62
|
+
)
|
|
63
|
+
def process_chunk(self, client, chunk_text: str, index: int) -> dict:
|
|
64
|
+
logger.info("Processing beautiful text chunk %s", index + 1)
|
|
65
|
+
start_time = time.time()
|
|
66
|
+
|
|
67
|
+
config = types.GenerateContentConfig(
|
|
68
|
+
safety_settings=[
|
|
69
|
+
types.SafetySetting(
|
|
70
|
+
category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
|
|
71
|
+
threshold=types.HarmBlockThreshold.BLOCK_NONE,
|
|
72
|
+
),
|
|
73
|
+
types.SafetySetting(
|
|
74
|
+
category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
|
|
75
|
+
threshold=types.HarmBlockThreshold.BLOCK_NONE,
|
|
76
|
+
),
|
|
77
|
+
types.SafetySetting(
|
|
78
|
+
category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
|
|
79
|
+
threshold=types.HarmBlockThreshold.BLOCK_NONE,
|
|
80
|
+
),
|
|
81
|
+
types.SafetySetting(
|
|
82
|
+
category=types.HarmCategory.HARM_CATEGORY_HARASSMENT,
|
|
83
|
+
threshold=types.HarmBlockThreshold.BLOCK_NONE,
|
|
84
|
+
),
|
|
85
|
+
]
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
response = client.models.generate_content(
|
|
89
|
+
model=self.model,
|
|
90
|
+
contents=[BEAUTIFUL_TEXT_PROMPT, chunk_text],
|
|
91
|
+
config=config,
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
logger.info("Beautiful text chunk %s processed in %.2fs", index + 1, time.time() - start_time)
|
|
95
|
+
|
|
96
|
+
return {
|
|
97
|
+
"transcript": response.text,
|
|
98
|
+
"completion_tokens": response.usage_metadata.candidates_token_count,
|
|
99
|
+
"prompt_tokens": response.usage_metadata.prompt_token_count,
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
def merge_cleaned_chunks(self, chunks: list[str]) -> str:
|
|
103
|
+
return TextMerger(llm_api_key=self.llm_api_key).merge_chunks(chunks=chunks)
|
|
104
|
+
|
|
105
|
+
def _convert_markdown_to_json(self, markdown_text: str) -> dict:
|
|
106
|
+
if not markdown_text.strip():
|
|
107
|
+
return {}
|
|
108
|
+
|
|
109
|
+
try:
|
|
110
|
+
markdown_to_json = import_module("markdown_to_json")
|
|
111
|
+
except ImportError as exc:
|
|
112
|
+
raise ImportError(
|
|
113
|
+
"markdown-to-json is required when active_chapters=True. "
|
|
114
|
+
"Install it with: pip install markdown-to-json"
|
|
115
|
+
) from exc
|
|
116
|
+
|
|
117
|
+
return markdown_to_json.dictify(markdown_text)
|
|
118
|
+
|
|
119
|
+
def _build_chapters(self, markdown_text: str) -> list[dict]:
|
|
120
|
+
heading_pattern = re.compile(r"^(#{1,6})\s+(.*?)\s*$")
|
|
121
|
+
chapters = []
|
|
122
|
+
stack: list[dict] = []
|
|
123
|
+
|
|
124
|
+
def finalize_nodes(target_depth: int = 0) -> None:
|
|
125
|
+
while len(stack) > target_depth:
|
|
126
|
+
node_state = stack.pop()
|
|
127
|
+
node_state["node"]["content"] = "\n".join(node_state["content_lines"]).strip()
|
|
128
|
+
|
|
129
|
+
for line in markdown_text.splitlines():
|
|
130
|
+
heading_match = heading_pattern.match(line)
|
|
131
|
+
if heading_match:
|
|
132
|
+
level = len(heading_match.group(1))
|
|
133
|
+
title = heading_match.group(2).strip()
|
|
134
|
+
|
|
135
|
+
while stack and stack[-1]["node"]["level"] >= level:
|
|
136
|
+
finalize_nodes(len(stack) - 1)
|
|
137
|
+
|
|
138
|
+
chapter_node = {
|
|
139
|
+
"title": title,
|
|
140
|
+
"level": level,
|
|
141
|
+
"content": "",
|
|
142
|
+
"children": [],
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
if stack:
|
|
146
|
+
stack[-1]["node"]["children"].append(chapter_node)
|
|
147
|
+
else:
|
|
148
|
+
chapters.append(chapter_node)
|
|
149
|
+
|
|
150
|
+
stack.append({"node": chapter_node, "content_lines": []})
|
|
151
|
+
continue
|
|
152
|
+
|
|
153
|
+
if stack:
|
|
154
|
+
stack[-1]["content_lines"].append(line)
|
|
155
|
+
|
|
156
|
+
finalize_nodes()
|
|
157
|
+
return chapters
|
|
158
|
+
|
|
159
|
+
def convert(self, raw_text: str, save_transcript_chunks: bool = False, active_chapters: bool = False) -> dict:
|
|
160
|
+
cleaned_input = (raw_text or "").strip()
|
|
161
|
+
if not cleaned_input:
|
|
162
|
+
result = {
|
|
163
|
+
"text": "",
|
|
164
|
+
"completion_tokens": 0,
|
|
165
|
+
"prompt_tokens": 0,
|
|
166
|
+
"completion_model": self.model,
|
|
167
|
+
"completion_model_provider": self.model_provider,
|
|
168
|
+
"text_chunks": [] if save_transcript_chunks else "not provided",
|
|
169
|
+
}
|
|
170
|
+
if active_chapters:
|
|
171
|
+
result["markdown_json"] = {}
|
|
172
|
+
result["chapters"] = []
|
|
173
|
+
return result
|
|
174
|
+
|
|
175
|
+
chunks = self.chunk_raw_text(cleaned_input)
|
|
176
|
+
client = self.get_client()
|
|
177
|
+
|
|
178
|
+
results = []
|
|
179
|
+
total_completion_tokens = 0
|
|
180
|
+
total_prompt_tokens = 0
|
|
181
|
+
|
|
182
|
+
with ThreadPoolExecutor() as executor:
|
|
183
|
+
future_to_index = {
|
|
184
|
+
executor.submit(self.process_chunk, client, chunk["text"], chunk["index"]): chunk["index"]
|
|
185
|
+
for chunk in chunks
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
for future in as_completed(future_to_index):
|
|
189
|
+
index = future_to_index[future]
|
|
190
|
+
result = future.result()
|
|
191
|
+
results.append((index, result["transcript"]))
|
|
192
|
+
total_completion_tokens += result["completion_tokens"]
|
|
193
|
+
total_prompt_tokens += result["prompt_tokens"]
|
|
194
|
+
|
|
195
|
+
cleaned_chunks = [text for index, text in sorted(results, key=lambda item: item[0])]
|
|
196
|
+
final_text = self.merge_cleaned_chunks(cleaned_chunks)
|
|
197
|
+
|
|
198
|
+
result = {
|
|
199
|
+
"text": final_text,
|
|
200
|
+
"completion_tokens": total_completion_tokens,
|
|
201
|
+
"prompt_tokens": total_prompt_tokens,
|
|
202
|
+
"completion_model": self.model,
|
|
203
|
+
"completion_model_provider": self.model_provider,
|
|
204
|
+
"text_chunks": cleaned_chunks if save_transcript_chunks else "not provided",
|
|
205
|
+
}
|
|
206
|
+
if active_chapters:
|
|
207
|
+
result["markdown_json"] = self._convert_markdown_to_json(final_text)
|
|
208
|
+
result["chapters"] = self._build_chapters(final_text)
|
|
209
|
+
return result
|
|
@@ -127,11 +127,24 @@ class DocumentConverter:
|
|
|
127
127
|
]
|
|
128
128
|
|
|
129
129
|
try:
|
|
130
|
-
|
|
131
|
-
|
|
130
|
+
subprocess.run(
|
|
131
|
+
command,
|
|
132
|
+
stdout=subprocess.PIPE,
|
|
133
|
+
stderr=subprocess.PIPE,
|
|
134
|
+
text=True,
|
|
135
|
+
check=True,
|
|
136
|
+
)
|
|
132
137
|
logger.info(f"Conversion successful: '{output_file}'")
|
|
133
138
|
except subprocess.CalledProcessError as e:
|
|
139
|
+
output_parts = []
|
|
140
|
+
if e.stdout:
|
|
141
|
+
output_parts.append(f"stdout: {e.stdout.strip()}")
|
|
142
|
+
if e.stderr:
|
|
143
|
+
output_parts.append(f"stderr: {e.stderr.strip()}")
|
|
144
|
+
details = "\n".join(output_parts)
|
|
134
145
|
error_msg = f"Error during conversion: {e}"
|
|
146
|
+
if details:
|
|
147
|
+
error_msg = f"{error_msg}\n{details}"
|
|
135
148
|
logger.info(error_msg)
|
|
136
149
|
raise ConversionError(error_msg, e)
|
|
137
150
|
|
|
@@ -253,4 +266,4 @@ class DocumentConverter:
|
|
|
253
266
|
# except Exception as e:
|
|
254
267
|
# error_msg = f"Error during PDF conversion: {str(e)}"
|
|
255
268
|
# logger.error(error_msg)
|
|
256
|
-
# raise ConversionError(error_msg)
|
|
269
|
+
# raise ConversionError(error_msg)
|
|
@@ -25,7 +25,7 @@ from ..loader import (
|
|
|
25
25
|
XmlXbrlLoader,
|
|
26
26
|
NotebookLoader
|
|
27
27
|
)
|
|
28
|
-
from ..exceptions import EmptyDocument, LoaderTimeoutError, LoaderError
|
|
28
|
+
from ..exceptions import ConversionError, EmptyDocument, LoaderTimeoutError, LoaderError
|
|
29
29
|
from ..utils.utils import clean_extracted_text_whitespace, remove_markdown_strip
|
|
30
30
|
|
|
31
31
|
# External imports
|
|
@@ -33,6 +33,8 @@ import boto3
|
|
|
33
33
|
from google.cloud import storage
|
|
34
34
|
from google.genai import errors as genai_errors
|
|
35
35
|
|
|
36
|
+
from ..converter.beautiful_text import BeautifulTextConverter
|
|
37
|
+
|
|
36
38
|
|
|
37
39
|
dotenv.load_dotenv()
|
|
38
40
|
|
|
@@ -46,6 +48,10 @@ LLM_OUTPUT_ERROR_CODES = {
|
|
|
46
48
|
997: "REPETITIVE_OUTPUT",
|
|
47
49
|
999: "MAX_TOKENS",
|
|
48
50
|
}
|
|
51
|
+
EMPTY_DOCUMENT_LOADER_ERROR_CODES = {
|
|
52
|
+
**LLM_OUTPUT_ERROR_CODES,
|
|
53
|
+
998: "NO_TEXT_DETECTED",
|
|
54
|
+
}
|
|
49
55
|
|
|
50
56
|
|
|
51
57
|
def _read_bool_env(name: str, default: bool = False) -> bool:
|
|
@@ -67,6 +73,20 @@ def _capture_exception_for_sentry(error: Exception) -> None:
|
|
|
67
73
|
return
|
|
68
74
|
|
|
69
75
|
|
|
76
|
+
def _raise_empty_document_loader_error(error: EmptyDocument) -> None:
|
|
77
|
+
loader_error_code = EMPTY_DOCUMENT_LOADER_ERROR_CODES.get(error.code, "NO_TEXT_DETECTED")
|
|
78
|
+
message = error.message
|
|
79
|
+
if loader_error_code == "NO_TEXT_DETECTED":
|
|
80
|
+
message = "No text detected"
|
|
81
|
+
else:
|
|
82
|
+
_capture_exception_for_sentry(error)
|
|
83
|
+
raise LoaderError(
|
|
84
|
+
message=message,
|
|
85
|
+
status=422,
|
|
86
|
+
code=loader_error_code,
|
|
87
|
+
) from error
|
|
88
|
+
|
|
89
|
+
|
|
70
90
|
class BaseLoader:
|
|
71
91
|
def __init__(self, markdown_output=True, llm_api_key=None, provider: str = "google", temp_dir: str = "temp",
|
|
72
92
|
ocr_model: str = "gpt-5-mini", timeout_minutes: int | None = None,
|
|
@@ -166,22 +186,23 @@ class BaseLoader:
|
|
|
166
186
|
response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
|
|
167
187
|
except EmptyDocument as e:
|
|
168
188
|
if e.code in LLM_OUTPUT_ERROR_CODES:
|
|
169
|
-
|
|
170
|
-
raise LoaderError(
|
|
171
|
-
message=e.message,
|
|
172
|
-
status=422,
|
|
173
|
-
code=LLM_OUTPUT_ERROR_CODES[e.code],
|
|
174
|
-
) from e
|
|
175
|
-
logger.info(f"Empty document encountered: {e.message}")
|
|
189
|
+
_raise_empty_document_loader_error(e)
|
|
176
190
|
if self.fallback_ocr:
|
|
177
191
|
loader_class = self.init_loader_class(input=first_file_url, storage_client=storage_client,
|
|
178
192
|
llm_api_key=self.llm_api_key, is_document_fallback=True, **kwargs)
|
|
179
|
-
|
|
193
|
+
try:
|
|
194
|
+
response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
|
|
195
|
+
except EmptyDocument as fallback_error:
|
|
196
|
+
_raise_empty_document_loader_error(fallback_error)
|
|
180
197
|
else:
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
198
|
+
_raise_empty_document_loader_error(e)
|
|
199
|
+
except ConversionError as e:
|
|
200
|
+
_capture_exception_for_sentry(e)
|
|
201
|
+
raise LoaderError(
|
|
202
|
+
message=e.message,
|
|
203
|
+
status=422,
|
|
204
|
+
code="CONVERSION_ERROR",
|
|
205
|
+
) from e
|
|
185
206
|
except LoaderTimeoutError:
|
|
186
207
|
raise LoaderError(message="timeout gemini", status=504, code="TIMEOUT")
|
|
187
208
|
except (httpx.ReadTimeout,
|
|
@@ -216,6 +237,59 @@ class BaseLoader:
|
|
|
216
237
|
|
|
217
238
|
return response
|
|
218
239
|
|
|
240
|
+
def get_beautiful_text(self, input_list: list[str], **kwargs):
|
|
241
|
+
if not isinstance(input_list, list) or not all(isinstance(item, str) for item in input_list):
|
|
242
|
+
raise TypeError("Parameter 'input' must be a list of strings.")
|
|
243
|
+
if not input_list:
|
|
244
|
+
raise ValueError("Input list is empty.")
|
|
245
|
+
if len(input_list) != 1:
|
|
246
|
+
raise ValueError("get_beautiful_text expects exactly one input.")
|
|
247
|
+
|
|
248
|
+
kwargs = {**self.kwargs, **kwargs}
|
|
249
|
+
raw_result = self.extract_raw_text_for_beautiful_text(input_value=input_list[0], **kwargs)
|
|
250
|
+
|
|
251
|
+
converter = BeautifulTextConverter(llm_api_key=self.llm_api_key)
|
|
252
|
+
cleanup_result = converter.convert(
|
|
253
|
+
raw_text=raw_result["text"],
|
|
254
|
+
save_transcript_chunks=kwargs.get("save_transcript_chunks", self.save_transcript_chunks),
|
|
255
|
+
active_chapters=kwargs.get("active_chapters", False),
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
total_completion_tokens = raw_result.get("completion_tokens", 0) + cleanup_result.get("completion_tokens", 0)
|
|
259
|
+
total_prompt_tokens = raw_result.get("prompt_tokens", 0) + cleanup_result.get("prompt_tokens", 0)
|
|
260
|
+
|
|
261
|
+
result_item = {
|
|
262
|
+
"text": cleanup_result["text"],
|
|
263
|
+
"completion_tokens": total_completion_tokens,
|
|
264
|
+
"prompt_tokens": total_prompt_tokens,
|
|
265
|
+
"completion_model": cleanup_result.get("completion_model", "not provided"),
|
|
266
|
+
"completion_model_provider": cleanup_result.get("completion_model_provider", "not provided"),
|
|
267
|
+
"text_chunks": cleanup_result.get("text_chunks", "not provided"),
|
|
268
|
+
"type": raw_result.get("type", "text"),
|
|
269
|
+
"input": input_list[0],
|
|
270
|
+
}
|
|
271
|
+
if "markdown_json" in cleanup_result:
|
|
272
|
+
result_item["markdown_json"] = cleanup_result["markdown_json"]
|
|
273
|
+
if "chapters" in cleanup_result:
|
|
274
|
+
result_item["chapters"] = cleanup_result["chapters"]
|
|
275
|
+
|
|
276
|
+
response = {
|
|
277
|
+
"text": result_item["text"],
|
|
278
|
+
"completion_tokens": result_item["completion_tokens"],
|
|
279
|
+
"prompt_tokens": result_item["prompt_tokens"],
|
|
280
|
+
"completion_model": result_item["completion_model"],
|
|
281
|
+
"completion_model_provider": result_item["completion_model_provider"],
|
|
282
|
+
"text_chunks": result_item["text_chunks"],
|
|
283
|
+
"type": result_item["type"],
|
|
284
|
+
"input": result_item["input"],
|
|
285
|
+
"output_list": [result_item],
|
|
286
|
+
}
|
|
287
|
+
if "markdown_json" in result_item:
|
|
288
|
+
response["markdown_json"] = result_item["markdown_json"]
|
|
289
|
+
if "chapters" in result_item:
|
|
290
|
+
response["chapters"] = result_item["chapters"]
|
|
291
|
+
return response
|
|
292
|
+
|
|
219
293
|
def initiate_storage(self, input: str) -> dict:
|
|
220
294
|
"""
|
|
221
295
|
Initializes and returns a client and relevant details for various cloud storage services or web URLs.
|
|
@@ -499,6 +573,114 @@ class BaseLoader:
|
|
|
499
573
|
return True
|
|
500
574
|
return False
|
|
501
575
|
|
|
576
|
+
@staticmethod
|
|
577
|
+
def is_remote_input(s: str) -> bool:
|
|
578
|
+
return s.startswith(("s3://", "gcs://", "http://", "https://", "www.", "www.youtube"))
|
|
579
|
+
|
|
580
|
+
@staticmethod
|
|
581
|
+
def is_text_file_extension(path_value: str) -> bool:
|
|
582
|
+
return Path(path_value).suffix.lower() in {".txt", ".text", ".md", ".markdown"}
|
|
583
|
+
|
|
584
|
+
@staticmethod
|
|
585
|
+
def is_beautiful_text_supported_file_extension(path_value: str) -> bool:
|
|
586
|
+
return Path(path_value).suffix.lower() in {
|
|
587
|
+
".txt",
|
|
588
|
+
".text",
|
|
589
|
+
".md",
|
|
590
|
+
".markdown",
|
|
591
|
+
".pdf",
|
|
592
|
+
".xlsx",
|
|
593
|
+
".docx",
|
|
594
|
+
".csv",
|
|
595
|
+
".odt",
|
|
596
|
+
".pptx",
|
|
597
|
+
".xls",
|
|
598
|
+
".doc",
|
|
599
|
+
".ppt",
|
|
600
|
+
".rtf",
|
|
601
|
+
".ipynb",
|
|
602
|
+
".xml",
|
|
603
|
+
".xbrl",
|
|
604
|
+
}
|
|
605
|
+
|
|
606
|
+
def extract_raw_text_for_beautiful_text(self, input_value: str, **kwargs) -> dict:
|
|
607
|
+
cleaned_input = input_value.strip()
|
|
608
|
+
|
|
609
|
+
if "\n" in cleaned_input or (not self.is_local_path(cleaned_input) and not self.is_remote_input(cleaned_input)):
|
|
610
|
+
return {
|
|
611
|
+
"text": cleaned_input,
|
|
612
|
+
"completion_tokens": 0,
|
|
613
|
+
"prompt_tokens": 0,
|
|
614
|
+
"completion_model": "not provided",
|
|
615
|
+
"completion_model_provider": "not provided",
|
|
616
|
+
"text_chunks": "not provided",
|
|
617
|
+
"type": "text",
|
|
618
|
+
"input": input_value,
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
local_path = Path(cleaned_input)
|
|
622
|
+
|
|
623
|
+
if local_path.exists():
|
|
624
|
+
if local_path.is_file() and self.is_text_file_extension(cleaned_input):
|
|
625
|
+
return {
|
|
626
|
+
"text": local_path.read_text(encoding="utf-8"),
|
|
627
|
+
"completion_tokens": 0,
|
|
628
|
+
"prompt_tokens": 0,
|
|
629
|
+
"completion_model": "not provided",
|
|
630
|
+
"completion_model_provider": "not provided",
|
|
631
|
+
"text_chunks": "not provided",
|
|
632
|
+
"type": "text",
|
|
633
|
+
"input": input_value,
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
if self.is_local_path(cleaned_input) and not self.is_remote_input(cleaned_input):
|
|
637
|
+
if not local_path.exists():
|
|
638
|
+
raise FileNotFoundError(f"Input not found or format not recognized: {input_value}")
|
|
639
|
+
|
|
640
|
+
if not self.is_beautiful_text_supported_file_extension(cleaned_input):
|
|
641
|
+
raise ValueError(
|
|
642
|
+
"get_beautiful_text supports only text or document inputs such as txt, md, pdf, docx, xlsx, csv, ipynb, xml, or xbrl."
|
|
643
|
+
)
|
|
644
|
+
|
|
645
|
+
if self.is_remote_input(cleaned_input):
|
|
646
|
+
if cleaned_input.startswith(("http://", "https://", "www.", "www.youtube")):
|
|
647
|
+
raise ValueError(
|
|
648
|
+
"get_beautiful_text does not support web pages, YouTube, audio, video, or image URLs. Pass text directly or a text/document file path."
|
|
649
|
+
)
|
|
650
|
+
|
|
651
|
+
if not self.is_beautiful_text_supported_file_extension(cleaned_input):
|
|
652
|
+
raise ValueError(
|
|
653
|
+
"get_beautiful_text supports only text or document file inputs such as txt, md, pdf, docx, xlsx, csv, ipynb, xml, or xbrl."
|
|
654
|
+
)
|
|
655
|
+
|
|
656
|
+
storage_client = self.initiate_storage(input=input_value)
|
|
657
|
+
loader_class = self.init_loader_class(
|
|
658
|
+
input=input_value,
|
|
659
|
+
storage_client=storage_client,
|
|
660
|
+
llm_api_key=self.llm_api_key,
|
|
661
|
+
**kwargs,
|
|
662
|
+
)
|
|
663
|
+
|
|
664
|
+
unsupported_loader_types = (AudioLoader, VideoLoader, OCRLoader, HtmlLoader, YoutubeTranscriptLoaderWithLlm)
|
|
665
|
+
if isinstance(loader_class, unsupported_loader_types):
|
|
666
|
+
raise ValueError(
|
|
667
|
+
"get_beautiful_text supports only text or document inputs, not audio, video, image, HTML, or YouTube sources."
|
|
668
|
+
)
|
|
669
|
+
|
|
670
|
+
extracted = self.run_loader_class(loader_class=loader_class, input_list=[input_value])
|
|
671
|
+
output_item = extracted.get("output_list", [{}])[0]
|
|
672
|
+
|
|
673
|
+
return {
|
|
674
|
+
"text": extracted.get("text", ""),
|
|
675
|
+
"completion_tokens": extracted.get("completion_tokens", 0),
|
|
676
|
+
"prompt_tokens": extracted.get("prompt_tokens", 0),
|
|
677
|
+
"completion_model": output_item.get("completion_model", "not provided"),
|
|
678
|
+
"completion_model_provider": output_item.get("completion_model_provider", "not provided"),
|
|
679
|
+
"text_chunks": output_item.get("text_chunks", "not provided"),
|
|
680
|
+
"type": output_item.get("type", "not provided"),
|
|
681
|
+
"input": output_item.get("input", input_value),
|
|
682
|
+
}
|
|
683
|
+
|
|
502
684
|
def validate_user_text(self, text: str) -> bool:
|
|
503
685
|
"""
|
|
504
686
|
Validate a text string. Raises EmptyDocument if the text is too short.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
BEAUTIFUL_TEXT_PROMPT = """
|
|
2
|
+
You are an editor specialized in cleaning spoken transcripts and raw text into faithful Markdown.
|
|
3
|
+
This is not summarization. This is not rewriting. This is a cleaned transcript or cleaned source text.
|
|
4
|
+
|
|
5
|
+
Your task is to remove only accidental noise while preserving the speaker's or author's original words,
|
|
6
|
+
phrasing, reasoning, tone, and sequence of ideas as faithfully as possible.
|
|
7
|
+
|
|
8
|
+
REMOVE ONLY:
|
|
9
|
+
- non-meaningful fillers such as "eh", "uhm", "diciamo", "eccetera eccetera", "no?" when used only as filler
|
|
10
|
+
- redundant "quindi", "appunto", "comunque" when they are only conversational padding
|
|
11
|
+
- accidental repeated words such as "di di", "da da", "che che"
|
|
12
|
+
- false starts and self-corrections only when they do not carry meaning
|
|
13
|
+
- irrelevant overlap fragments between speakers
|
|
14
|
+
|
|
15
|
+
PRESERVE COMPLETELY:
|
|
16
|
+
- the original wording and sentence structure, even if colloquial
|
|
17
|
+
- technical terms and proper nouns exactly
|
|
18
|
+
- the original tone and register
|
|
19
|
+
- reasoning, opinions, nuances, and meaningful uncertainty
|
|
20
|
+
- the logical order of the discussion
|
|
21
|
+
|
|
22
|
+
DO NOT:
|
|
23
|
+
- rewrite sentences in a more elegant style
|
|
24
|
+
- replace words with synonyms
|
|
25
|
+
- summarize, compress, or simplify concepts
|
|
26
|
+
- add explanations, transitions, or missing content
|
|
27
|
+
- correct the speaker's opinions or inaccuracies
|
|
28
|
+
- make the language more formal than the original
|
|
29
|
+
|
|
30
|
+
FORMATTING:
|
|
31
|
+
- output Markdown only
|
|
32
|
+
- use paragraphs to separate thematic blocks
|
|
33
|
+
- add headings only when the speaker explicitly introduces a new topic
|
|
34
|
+
- use bullet lists or numbered lists only when the source explicitly enumerates items or when the sequence is clearly list-shaped
|
|
35
|
+
- use emphasis sparingly and only when grounded in the original text
|
|
36
|
+
- use **bold** for key information and important concepts, and *italics* for subtle emphasis or contextual terms in every chapter and paragraph whenever they improve readability and understanding
|
|
37
|
+
- do not add code fences
|
|
38
|
+
- do not add introductions or commentary
|
|
39
|
+
|
|
40
|
+
FINAL CHECK:
|
|
41
|
+
- every sentence in the output must be traceable to an equivalent sentence in the input
|
|
42
|
+
- if a sentence cannot be grounded in the input, remove it
|
|
43
|
+
"""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: polytext
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.6
|
|
4
4
|
Summary: Python utilities to simplify document files management
|
|
5
5
|
Home-page: https://github.com/docsity/polytext
|
|
6
6
|
Author: Matteo Senardi
|
|
@@ -21,6 +21,7 @@ Requires-Dist: PyMuPDF>=1.25.5
|
|
|
21
21
|
Requires-Dist: pycryptodome==3.23.0
|
|
22
22
|
Requires-Dist: weasyprint==65.1
|
|
23
23
|
Requires-Dist: markdown==3.8
|
|
24
|
+
Requires-Dist: markdown-to-json==2.1.2
|
|
24
25
|
Requires-Dist: python-docx==1.1.2
|
|
25
26
|
Requires-Dist: google-api-core>=2.24.2
|
|
26
27
|
Requires-Dist: google-cloud-storage<3.0.0,>=2.17
|
|
@@ -12,6 +12,7 @@ polytext.egg-info/top_level.txt
|
|
|
12
12
|
polytext/converter/__init__.py
|
|
13
13
|
polytext/converter/audio_to_text.py
|
|
14
14
|
polytext/converter/base.py
|
|
15
|
+
polytext/converter/beautiful_text.py
|
|
15
16
|
polytext/converter/document_ocr_to_text.py
|
|
16
17
|
polytext/converter/document_ocr_to_text_azure_oai.py
|
|
17
18
|
polytext/converter/gemini_quality_guards.py
|
|
@@ -47,6 +48,7 @@ polytext/processor/audio_chunker.py
|
|
|
47
48
|
polytext/processor/text_merger.py
|
|
48
49
|
polytext/processor/transcript_chunker.py
|
|
49
50
|
polytext/prompts/__init__.py
|
|
51
|
+
polytext/prompts/beautiful_text.py
|
|
50
52
|
polytext/prompts/ocr.py
|
|
51
53
|
polytext/prompts/text_merging.py
|
|
52
54
|
polytext/prompts/text_to_md.py
|
|
@@ -57,6 +59,7 @@ tests/test_audio_chunker.py
|
|
|
57
59
|
tests/test_audio_comparison_helpers.py
|
|
58
60
|
tests/test_audio_transcription_model_migration.py
|
|
59
61
|
tests/test_base_loader_error_mapping.py
|
|
62
|
+
tests/test_beautiful_text_manual.py
|
|
60
63
|
tests/test_compare_audio_models.py
|
|
61
64
|
tests/test_compare_document_ocr_to_text_models.py
|
|
62
65
|
tests/test_compare_ocr_to_text_models.py
|
|
@@ -81,6 +84,7 @@ tests/test_notebook_loader.py
|
|
|
81
84
|
tests/test_ocr_fallbacks.py
|
|
82
85
|
tests/test_ocr_image_descriptions.py
|
|
83
86
|
tests/test_pain_text.py
|
|
87
|
+
tests/test_pdf_conversion_error.py
|
|
84
88
|
tests/test_python_version_metadata.py
|
|
85
89
|
tests/test_split_audio_with_llm.py
|
|
86
90
|
tests/test_xml_xbrl_loader.py
|