nl-processing 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nl_processing-0.1.0/PKG-INFO +22 -0
- nl_processing-0.1.0/README.md +11 -0
- nl_processing-0.1.0/nl_processing/__init__.py +0 -0
- nl_processing-0.1.0/nl_processing/core/__init__.py +0 -0
- nl_processing-0.1.0/nl_processing/core/exceptions.py +10 -0
- nl_processing-0.1.0/nl_processing/core/models.py +21 -0
- nl_processing-0.1.0/nl_processing/core/prompts.py +98 -0
- nl_processing-0.1.0/nl_processing/core/scripts/prompt_author.py +59 -0
- nl_processing-0.1.0/nl_processing/database/__init__.py +0 -0
- nl_processing-0.1.0/nl_processing/database/service.py +2 -0
- nl_processing-0.1.0/nl_processing/extract_text_from_image/__init__.py +1 -0
- nl_processing-0.1.0/nl_processing/extract_text_from_image/benchmark.py +57 -0
- nl_processing-0.1.0/nl_processing/extract_text_from_image/image_encoding.py +64 -0
- nl_processing-0.1.0/nl_processing/extract_text_from_image/prompts/generate_nl_prompt.py +159 -0
- nl_processing-0.1.0/nl_processing/extract_text_from_image/service.py +83 -0
- nl_processing-0.1.0/nl_processing/extract_words_from_text/__init__.py +0 -0
- nl_processing-0.1.0/nl_processing/extract_words_from_text/prompts/generate_nl_prompt.py +122 -0
- nl_processing-0.1.0/nl_processing/extract_words_from_text/service.py +54 -0
- nl_processing-0.1.0/nl_processing/translate_text/__init__.py +0 -0
- nl_processing-0.1.0/nl_processing/translate_text/prompts/generate_nl_ru_prompt.py +88 -0
- nl_processing-0.1.0/nl_processing/translate_text/service.py +58 -0
- nl_processing-0.1.0/nl_processing/translate_word/__init__.py +0 -0
- nl_processing-0.1.0/nl_processing/translate_word/prompts/generate_nl_ru_prompt.py +112 -0
- nl_processing-0.1.0/nl_processing/translate_word/service.py +67 -0
- nl_processing-0.1.0/nl_processing.egg-info/PKG-INFO +22 -0
- nl_processing-0.1.0/nl_processing.egg-info/SOURCES.txt +29 -0
- nl_processing-0.1.0/nl_processing.egg-info/dependency_links.txt +1 -0
- nl_processing-0.1.0/nl_processing.egg-info/requires.txt +4 -0
- nl_processing-0.1.0/nl_processing.egg-info/top_level.txt +1 -0
- nl_processing-0.1.0/pyproject.toml +31 -0
- nl_processing-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nl_processing
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Natural language processing playground
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: pydantic<3,>=2.0
|
|
8
|
+
Requires-Dist: langchain<1,>=0.3
|
|
9
|
+
Requires-Dist: langchain-openai<1,>=0.3
|
|
10
|
+
Requires-Dist: opencv-python<5,>=4.10
|
|
11
|
+
|
|
12
|
+
# nl_processing
|
|
13
|
+
|
|
14
|
+
Minimal NLP processing project scaffolded from `proto_tg_bot` structure.
|
|
15
|
+
|
|
16
|
+
## Directory Structure
|
|
17
|
+
|
|
18
|
+
| Directory | Description |
|
|
19
|
+
|-------------------|-------------------------------------|
|
|
20
|
+
| `nl_processing/` | Main package source code |
|
|
21
|
+
| `tests/` | Unit, integration, and e2e tests |
|
|
22
|
+
| `scripts/` | Utility/dev scripts |
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# nl_processing
|
|
2
|
+
|
|
3
|
+
Minimal NLP processing project scaffolded from `proto_tg_bot` structure.
|
|
4
|
+
|
|
5
|
+
## Directory Structure
|
|
6
|
+
|
|
7
|
+
| Directory | Description |
|
|
8
|
+
|-------------------|-------------------------------------|
|
|
9
|
+
| `nl_processing/` | Main package source code |
|
|
10
|
+
| `tests/` | Unit, integration, and e2e tests |
|
|
11
|
+
| `scripts/` | Utility/dev scripts |
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
class APIError(Exception):
    """Signals a failure in the upstream OpenAI/LangChain API call."""


class TargetLanguageNotFoundError(Exception):
    """Signals that no text in the target language was detected in the input."""


class UnsupportedImageFormatError(Exception):
    """Signals that the image format cannot be handled by the OpenAI API."""
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Language(Enum):
    """Languages recognized by the processing services."""

    NL = "nl"
    RU = "ru"


# NOTE: the pydantic models below are bound as OpenAI tool schemas elsewhere
# in the package; they deliberately carry no docstrings, because a docstring
# would become part of the serialized tool description.
class ExtractedText(BaseModel):
    # Text extracted from an image, markdown-formatted.
    text: str


class WordEntry(BaseModel):
    # Normalized dictionary form of a word (e.g. "de kat", "lopen").
    normalized_form: str
    # Flat word-type tag (e.g. "noun", "verb", "proper_noun_person").
    word_type: str


class TranslationResult(BaseModel):
    # The produced translation for a word or text.
    translation: str
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import pathlib
|
|
3
|
+
|
|
4
|
+
from langchain_core.load import load
|
|
5
|
+
from langchain_core.prompts import ChatPromptTemplate
|
|
6
|
+
from langchain_core.runnables import RunnableSerializable
|
|
7
|
+
from langchain_openai import ChatOpenAI
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
from nl_processing.core.models import Language
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def build_translation_chain(
    *,
    source_language: Language,
    target_language: Language,
    supported_pairs: set[tuple[str, str]],
    prompts_dir: pathlib.Path,
    tool_schema: type[BaseModel],
    model: str,
) -> RunnableSerializable:  # type: ignore[type-arg]
    """Validate a language pair, load its prompt, and return a prompt|llm chain.

    This is shared infrastructure for translation-style services that follow the
    pattern: validate pair → load JSON prompt → bind_tools → compose chain.

    Args:
        source_language: Source language enum value.
        target_language: Target language enum value.
        supported_pairs: Set of (src, tgt) value strings that are allowed.
        prompts_dir: Directory containing ``<src>_<tgt>.json`` prompt files.
        tool_schema: Pydantic model class to bind as a tool.
        model: OpenAI model identifier string.

    Returns:
        A ``prompt | llm`` RunnableSerializable ready for ``ainvoke()``.

    Raises:
        ValueError: If the language pair is not in *supported_pairs*.
    """
    pair = (source_language.value, target_language.value)
    if pair not in supported_pairs:
        # sorted() keeps the error message deterministic — interpolating the
        # raw set would render the pairs in arbitrary order.
        msg = (
            f"Unsupported language pair: "
            f"{source_language.value} -> {target_language.value}. "
            f"Supported pairs: {sorted(supported_pairs)}"
        )
        raise ValueError(msg)

    prompt_file = f"{source_language.value}_{target_language.value}.json"
    prompt = load_prompt(str(prompts_dir / prompt_file))

    # temperature=0 for deterministic output; tool_choice forces the model to
    # answer via the bound schema.
    llm = ChatOpenAI(model=model, temperature=0).bind_tools(
        [tool_schema],
        tool_choice=tool_schema.__name__,
    )
    return prompt | llm
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def load_prompt(prompt_path: str) -> ChatPromptTemplate:
    """Load a ChatPromptTemplate from a LangChain-serialized JSON file.

    The JSON file must contain the output of ``langchain_core.load.dumpd(prompt)``.

    Args:
        prompt_path: Path to the prompt JSON file in LangChain native format.

    Returns:
        A ChatPromptTemplate ready for chain composition.

    Raises:
        FileNotFoundError: If the prompt file does not exist.
        ValueError: If the JSON file is malformed or cannot be deserialized.
        TypeError: If the file content is not a JSON object or the deserialized
            object is not a ChatPromptTemplate.
    """
    path = pathlib.Path(prompt_path)
    if not path.exists():
        raise FileNotFoundError(f"Prompt file not found: {prompt_path}")

    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON in prompt file {prompt_path}: {e}") from e

    if not isinstance(data, dict):
        raise TypeError(f"Prompt file must contain a JSON object, got {type(data).__name__}")

    # langchain's load() can fail in many ways on a malformed payload; surface
    # them uniformly as ValueError with the originating path.
    try:
        prompt = load(data)
    except Exception as e:
        raise ValueError(f"Failed to deserialize ChatPromptTemplate from {prompt_path}: {e}") from e

    if not isinstance(prompt, ChatPromptTemplate):
        raise TypeError(f"Expected ChatPromptTemplate, got {type(prompt).__name__} from {prompt_path}")

    return prompt
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Prompt authoring helper — serialize ChatPromptTemplate to JSON.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
1. Edit the ``build_prompt()`` function below to define your prompt.
|
|
5
|
+
2. Set OUTPUT_PATH to your desired output file path.
|
|
6
|
+
3. Run: uv run python nl_processing/core/scripts/prompt_author.py
|
|
7
|
+
|
|
8
|
+
The output JSON can be loaded by ``nl_processing.core.prompts.load_prompt()``.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
|
|
13
|
+
from langchain_core.load import dumpd
|
|
14
|
+
from langchain_core.prompts import ChatPromptTemplate
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def build_prompt() -> ChatPromptTemplate:
    """Define your prompt here. Edit this function for each prompt you author."""
    messages = [
        ("system", "You are a helpful assistant. Respond in {language}."),
        ("human", "{input}"),
    ]
    return ChatPromptTemplate.from_messages(messages)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def serialize_prompt_to_json(prompt: ChatPromptTemplate, output_path: str) -> None:
    """Serialize a ChatPromptTemplate to JSON using LangChain native format.

    Non-ASCII characters (the prompt scripts embed Dutch and Russian example
    text) are written verbatim instead of ``\\uXXXX`` escapes, keeping the
    generated artifact human-readable and diff-friendly.

    Args:
        prompt: The ChatPromptTemplate to serialize.
        output_path: Path where to save the JSON file.
    """
    data = dumpd(prompt)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
        f.write("\n")  # trailing newline so the file ends cleanly
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def save_prompt(prompt: ChatPromptTemplate, output_path: str) -> None:
    """Serialize a prompt to JSON and print a summary to stdout.

    Intended for use in ``if __name__ == "__main__"`` blocks of prompt
    generation scripts so each script stays DRY.

    Args:
        prompt: The ChatPromptTemplate to serialize.
        output_path: Path where to save the JSON file.
    """
    serialize_prompt_to_json(prompt, output_path)
    summary = (
        f"Prompt saved to {output_path}",
        f"Messages: {len(prompt.messages)}",
        f"Input variables: {prompt.input_variables}",
    )
    for line in summary:
        print(line)  # noqa: T201


OUTPUT_PATH = "output_prompt.json"


if __name__ == "__main__":
    _prompt = build_prompt()
    save_prompt(_prompt, OUTPUT_PATH)
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
import cv2
|
|
4
|
+
import numpy
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def generate_test_image(
    text: str,
    output_path: str,
    *,
    width: int = 800,
    height: int = 200,
    font_scale: float = 1.0,
    thickness: int = 2,
) -> str:
    """Generate a synthetic test image with known text rendered on it.

    Each ``\\n``-separated line of *text* is drawn on its own row of a white
    canvas. Returns the output file path.
    """
    canvas = numpy.full((height, width, 3), 255, dtype=numpy.uint8)  # white background

    baseline = 40
    step = int(40 * font_scale)
    for row in text.split("\n"):
        cv2.putText(
            canvas,
            row,
            (20, baseline),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_scale,
            (0, 0, 0),
            thickness,
        )
        baseline += step

    if not cv2.imwrite(output_path, canvas):
        raise ValueError(f"Failed to write image to {output_path}")
    return output_path
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def normalize_text(text: str) -> str:
    """Normalize text for comparison: strip whitespace, line breaks, markdown formatting."""
    without_markdown = re.sub(r"[#*_~`>\-]+", "", text)  # drop markdown punctuation
    collapsed = re.sub(r"\s+", " ", without_markdown)  # squash whitespace runs
    return collapsed.strip().lower()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def evaluate_extraction(extracted: str, ground_truth: str) -> bool:
    """Compare extracted text against ground truth after normalization.

    Returns True if exact match after normalization.
    """
    left = normalize_text(extracted)
    right = normalize_text(ground_truth)
    return left == right
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import pathlib
|
|
3
|
+
|
|
4
|
+
import cv2
|
|
5
|
+
import numpy
|
|
6
|
+
|
|
7
|
+
from nl_processing.core.exceptions import UnsupportedImageFormatError
|
|
8
|
+
|
|
9
|
+
SUPPORTED_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp"}
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_image_format(path: str) -> str:
    """Return the file extension (lowercase) for the given image path."""
    suffix = pathlib.Path(path).suffix
    return suffix.lower()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def validate_image_format(path: str) -> None:
    """Validate that the image format is supported by OpenAI Vision API.

    Raises:
        UnsupportedImageFormatError: If the file extension is not in SUPPORTED_EXTENSIONS.
    """
    suffix = get_image_format(path)
    if suffix in SUPPORTED_EXTENSIONS:
        return
    msg = f"Unsupported image format '{suffix}'. Supported formats: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
    raise UnsupportedImageFormatError(msg)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def encode_path_to_base64(path: str) -> tuple[str, str]:
    """Read an image file and return (base64_string, media_type).

    Does NOT validate format — caller is responsible for validation.
    """
    media_type = _suffix_to_media_type(get_image_format(path))
    raw = pathlib.Path(path).read_bytes()
    encoded = base64.b64encode(raw).decode("utf-8")
    return encoded, media_type
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def encode_cv2_to_base64(image: numpy.ndarray) -> tuple[str, str]:
    """Encode an OpenCV image array to base64 PNG.

    Returns (base64_string, media_type).
    """
    ok, png_buffer = cv2.imencode(".png", image)
    if not ok:
        msg = "Failed to encode image to PNG"
        raise ValueError(msg)
    encoded = base64.b64encode(png_buffer.tobytes()).decode("utf-8")
    return encoded, "image/png"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _suffix_to_media_type(suffix: str) -> str:
|
|
56
|
+
"""Convert file extension to MIME media type."""
|
|
57
|
+
mapping = {
|
|
58
|
+
".png": "image/png",
|
|
59
|
+
".jpg": "image/jpeg",
|
|
60
|
+
".jpeg": "image/jpeg",
|
|
61
|
+
".gif": "image/gif",
|
|
62
|
+
".webp": "image/webp",
|
|
63
|
+
}
|
|
64
|
+
return mapping[suffix]
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""Generate the Dutch extraction prompt (nl.json) with few-shot examples.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
uv run python nl_processing/extract_text_from_image/prompts/generate_nl_prompt.py
|
|
5
|
+
|
|
6
|
+
This script:
|
|
7
|
+
1. Generates synthetic test images and encodes real photos
|
|
8
|
+
2. Encodes them to base64
|
|
9
|
+
3. Builds a ChatPromptTemplate with 5 few-shot examples (HumanMessage + AIMessage + ToolMessage triplets)
|
|
10
|
+
4. Serializes with dumpd() and saves to nl.json
|
|
11
|
+
|
|
12
|
+
The script is the source of truth — nl.json is the generated artifact.
|
|
13
|
+
Re-run this script whenever example text or image parameters change.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
import tempfile
|
|
18
|
+
|
|
19
|
+
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
|
|
20
|
+
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
|
21
|
+
|
|
22
|
+
from nl_processing.extract_text_from_image.benchmark import generate_test_image
|
|
23
|
+
from nl_processing.extract_text_from_image.image_encoding import encode_path_to_base64
|
|
24
|
+
|
|
25
|
+
SYSTEM_INSTRUCTION = (
|
|
26
|
+
"Je bent een tekst-extractie assistent. "
|
|
27
|
+
"Extraheer alleen de Nederlandse tekst uit het aangeboden beeld. "
|
|
28
|
+
"Behoud de originele documentstructuur als markdown "
|
|
29
|
+
"(koppen, nadruk, regelafbrekingen). "
|
|
30
|
+
"Negeer tekst in andere talen. "
|
|
31
|
+
"Retourneer alleen de geëxtraheerde tekst, zonder commentaar of uitleg."
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
EXAMPLE_1_TEXT = "De kat zit op de mat"
|
|
35
|
+
EXAMPLE_1_EXPECTED = "De kat zit op de mat"
|
|
36
|
+
|
|
37
|
+
EXAMPLE_2_TEXT = "Welkom bij ons\nДобро пожаловать"
|
|
38
|
+
EXAMPLE_2_EXPECTED = "Welkom bij ons"
|
|
39
|
+
|
|
40
|
+
EXAMPLE_3_IMAGE = Path(__file__).parent / "examples" / "dutch_handwritten_mixed.jpg"
|
|
41
|
+
EXAMPLE_3_EXPECTED = (
|
|
42
|
+
"getal, het\n"
|
|
43
|
+
"getrouwd\n"
|
|
44
|
+
"niet\n"
|
|
45
|
+
"nieuw\n"
|
|
46
|
+
"mooi\n"
|
|
47
|
+
"hoog\n"
|
|
48
|
+
"baan\n"
|
|
49
|
+
"kunst\n"
|
|
50
|
+
"heel\n"
|
|
51
|
+
"leren kennen\n"
|
|
52
|
+
"eeuw\n"
|
|
53
|
+
"moe\n"
|
|
54
|
+
"vroeg\n"
|
|
55
|
+
"ver\n"
|
|
56
|
+
"daar\n"
|
|
57
|
+
"tijd\n"
|
|
58
|
+
"lezen"
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
EXAMPLE_4_IMAGE = Path(__file__).parent / "examples" / "dutch_vocabulary_wide.jpg"
|
|
62
|
+
EXAMPLE_4_EXPECTED = (
|
|
63
|
+
"vandaan\n"
|
|
64
|
+
"veranderen\n"
|
|
65
|
+
"verbeteren\n"
|
|
66
|
+
"vlakbij\n"
|
|
67
|
+
"volgorde, de\n"
|
|
68
|
+
"voorbeeld, het\n"
|
|
69
|
+
"voornaam, de\n"
|
|
70
|
+
"vorm, de\n"
|
|
71
|
+
"vraag, de\n"
|
|
72
|
+
"vriendin, de\n"
|
|
73
|
+
"vrouw, de\n"
|
|
74
|
+
"wat\n"
|
|
75
|
+
"week, de\n"
|
|
76
|
+
"welkom\n"
|
|
77
|
+
"werken\n"
|
|
78
|
+
"wonen\n"
|
|
79
|
+
"woonplaats, de\n"
|
|
80
|
+
"woord, het\n"
|
|
81
|
+
"ze\n"
|
|
82
|
+
"zeggen\n"
|
|
83
|
+
"zij\n"
|
|
84
|
+
"zijn\n"
|
|
85
|
+
"zijn\n"
|
|
86
|
+
"zin, de"
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
EXAMPLE_5_TEXT = "The quick brown fox jumps over the lazy dog"
|
|
90
|
+
EXAMPLE_5_EXPECTED = ""
|
|
91
|
+
|
|
92
|
+
OUTPUT_PATH = Path(__file__).parent / "nl.json"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _generate_image_b64(text: str, *, width: int = 800, height: int = 200) -> str:
    """Generate a synthetic image and return its base64 data URL."""
    with tempfile.TemporaryDirectory() as tmpdir:
        target = str(Path(tmpdir) / "image.png")
        generate_test_image(text, target, width=width, height=height, font_scale=1.2)
        b64, media_type = encode_path_to_base64(target)
    return f"data:{media_type};base64,{b64}"
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _encode_existing_image_b64(path: Path) -> str:
    """Encode an existing image file and return its base64 data URL."""
    payload, media_type = encode_path_to_base64(str(path))
    return f"data:{media_type};base64,{payload}"
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _make_example_human(image_data_url: str) -> HumanMessage:
    """Create a HumanMessage with an image content block."""
    image_block = {"type": "image_url", "image_url": {"url": image_data_url}}
    return HumanMessage(content=[image_block])
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _make_example_ai(expected_text: str, call_id: str) -> AIMessage:
    """Create an AIMessage with a tool_call for ExtractedText."""
    tool_call = {"name": "ExtractedText", "args": {"text": expected_text}, "id": call_id}
    return AIMessage(content="", tool_calls=[tool_call])
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def build_prompt() -> ChatPromptTemplate:
    """Build the Dutch extraction prompt with 5 few-shot examples."""
    # Each example is (image data URL, expected extraction); order matters —
    # it defines the few-shot sequence in the serialized prompt.
    examples = [
        (_generate_image_b64(EXAMPLE_1_TEXT), EXAMPLE_1_EXPECTED),
        (_generate_image_b64(EXAMPLE_2_TEXT), EXAMPLE_2_EXPECTED),
        (_encode_existing_image_b64(EXAMPLE_3_IMAGE), EXAMPLE_3_EXPECTED),
        (_encode_existing_image_b64(EXAMPLE_4_IMAGE), EXAMPLE_4_EXPECTED),
        (_generate_image_b64(EXAMPLE_5_TEXT), EXAMPLE_5_EXPECTED),
    ]

    messages: list = [SystemMessage(content=SYSTEM_INSTRUCTION)]
    for index, (image_url, expected) in enumerate(examples, 1):
        call_id = f"call_example_{index}"
        messages += [
            _make_example_human(image_url),
            _make_example_ai(expected, call_id),
            ToolMessage(content=expected, tool_call_id=call_id),
        ]
    messages.append(MessagesPlaceholder(variable_name="images"))
    return ChatPromptTemplate.from_messages(messages)


if __name__ == "__main__":
    from nl_processing.core.scripts.prompt_author import save_prompt

    save_prompt(build_prompt(), str(OUTPUT_PATH))
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import pathlib
|
|
2
|
+
|
|
3
|
+
from langchain_core.messages import HumanMessage
|
|
4
|
+
from langchain_openai import ChatOpenAI
|
|
5
|
+
import numpy
|
|
6
|
+
|
|
7
|
+
from nl_processing.core.exceptions import APIError, TargetLanguageNotFoundError
|
|
8
|
+
from nl_processing.core.models import ExtractedText, Language
|
|
9
|
+
from nl_processing.core.prompts import load_prompt
|
|
10
|
+
from nl_processing.extract_text_from_image.image_encoding import (
|
|
11
|
+
encode_cv2_to_base64,
|
|
12
|
+
encode_path_to_base64,
|
|
13
|
+
validate_image_format,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
# Resolve prompts directory relative to this file
|
|
17
|
+
_PROMPTS_DIR = pathlib.Path(__file__).parent / "prompts"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ImageTextExtractor:
    """Extract language-specific text from images using OpenAI Vision API.

    Usage:
        extractor = ImageTextExtractor()
        text = await extractor.extract_from_path("image.png")
        text = await extractor.extract_from_cv2(cv2_image)
    """

    def __init__(
        self,
        *,
        language: Language = Language.NL,
        model: str = "gpt-4.1-mini",
        reasoning_effort: str | None = None,
        service_tier: str | None = None,
        temperature: float | None = 0,
    ) -> None:
        """Load the per-language prompt and assemble the extraction chain.

        Args:
            language: Target language whose prompt artifact is loaded.
            model: OpenAI model identifier string.
            reasoning_effort: Optional reasoning effort forwarded to ChatOpenAI.
            service_tier: Optional service tier forwarded to ChatOpenAI.
            temperature: Sampling temperature; defaults to 0.
        """
        self._language = language
        # Prompt artifact is looked up by language code, e.g. prompts/nl.json.
        prompt_path = str(_PROMPTS_DIR / f"{language.value}.json")
        prompt = load_prompt(prompt_path)

        # tool_choice is set to the ExtractedText schema name, so responses are
        # expected to carry a matching tool call (consumed in _aextract).
        llm = ChatOpenAI(
            model=model, service_tier=service_tier, reasoning_effort=reasoning_effort, temperature=temperature
        ).bind_tools([ExtractedText], tool_choice=ExtractedText.__name__)

        self._chain = prompt | llm

    async def extract_from_path(self, path: str) -> str:
        """Extract text from image at the given file path.

        Returns markdown-formatted text in the target language.

        Raises:
            UnsupportedImageFormatError: If the file extension is unsupported.
            APIError: If the chain invocation or tool-call parsing fails.
            TargetLanguageNotFoundError: If the extracted text is empty.
        """
        validate_image_format(path)
        base64_string, media_type = encode_path_to_base64(path)
        return await self._aextract(base64_string, media_type)

    async def extract_from_cv2(self, image: "numpy.ndarray") -> str:
        """Extract text from OpenCV image array.

        Returns markdown-formatted text in the target language.

        Raises:
            APIError: If the chain invocation or tool-call parsing fails.
            TargetLanguageNotFoundError: If the extracted text is empty.
        """
        base64_string, media_type = encode_cv2_to_base64(image)
        return await self._aextract(base64_string, media_type)

    async def _aextract(self, base64_string: str, media_type: str) -> str:
        """Internal: run the extraction chain with the base64 image."""
        # The image travels as an OpenAI-style data URL inside one HumanMessage,
        # fed to the prompt's "images" placeholder.
        human_message = HumanMessage(
            content=[
                {"type": "image_url", "image_url": {"url": f"data:{media_type};base64,{base64_string}"}},
            ]
        )
        try:
            response = await self._chain.ainvoke({"images": [human_message]})
            # Any failure here (API error, missing/ill-shaped tool call) is
            # wrapped uniformly as APIError.
            result = ExtractedText(**response.tool_calls[0]["args"])  # type: ignore[attr-defined]
        except Exception as e:
            raise APIError(str(e)) from e

        # Check if target language text was found
        if not result.text.strip():
            msg = "No text in the target language was found in the image"
            raise TargetLanguageNotFoundError(msg)

        return result.text
|
|
File without changes
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""Generate the Dutch word extraction prompt (nl.json) with few-shot examples.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
uv run python nl_processing/extract_words_from_text/prompts/generate_nl_prompt.py
|
|
5
|
+
|
|
6
|
+
The script is the source of truth -- nl.json is the generated artifact.
|
|
7
|
+
Re-run this script whenever prompt content changes.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
|
|
14
|
+
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
|
15
|
+
|
|
16
|
+
SYSTEM_INSTRUCTION = (
|
|
17
|
+
"Je bent een woord-extractie assistent voor de Nederlandse taal. "
|
|
18
|
+
"Je taak is om alle Nederlandse woorden uit de aangeboden tekst te extraheren en te normaliseren.\n\n"
|
|
19
|
+
"Regels:\n"
|
|
20
|
+
"- Negeer alle markdown-opmaak (koppen, vet, cursief, lijsten) -- extraheer alleen taalkundige inhoud.\n"
|
|
21
|
+
"- Negeer tekst in andere talen dan Nederlands.\n"
|
|
22
|
+
"- Normaliseer elk woord volgens de Nederlandse regels:\n"
|
|
23
|
+
" - Zelfstandige naamwoorden: met lidwoord (de/het), bijv. 'de fiets', 'het huis'\n"
|
|
24
|
+
" - Werkwoorden: infinitief, bijv. 'lopen', 'hebben'\n"
|
|
25
|
+
" - Bijvoeglijke naamwoorden: basisvorm, bijv. 'groot', 'klein'\n"
|
|
26
|
+
" - Voorzetsels, voegwoorden, bijwoorden: basisvorm\n"
|
|
27
|
+
" - Eigennamen (personen/merken): ongewijzigd, type 'proper_noun_person'\n"
|
|
28
|
+
" - Eigennamen (landen): ongewijzigd, type 'proper_noun_country'\n"
|
|
29
|
+
"- Extraheer samengestelde uitdrukkingen en fraseologische constructies als enkele eenheden.\n"
|
|
30
|
+
"- Wijs een plat woordtype toe aan elk woord. Mogelijke types: "
|
|
31
|
+
"noun, verb, adjective, adverb, preposition, conjunction, pronoun, article, numeral, "
|
|
32
|
+
"proper_noun_person, proper_noun_country.\n"
|
|
33
|
+
"- Retourneer het resultaat als een lijst van WordEntry objecten in een _WordList wrapper.\n"
|
|
34
|
+
"- Als de tekst geen Nederlandse woorden bevat, retourneer dan een lege lijst."
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
TOOL_NAME = "_WordList"
|
|
38
|
+
OUTPUT_PATH = Path(__file__).parent / "nl.json"
|
|
39
|
+
|
|
40
|
+
_W = dict[str, str]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _w(form: str, wtype: str) -> _W:
    """Build a WordEntry-shaped dict for the few-shot examples."""
    entry = {"normalized_form": form, "word_type": wtype}
    return entry
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# fmt: off
|
|
48
|
+
EXAMPLES: list[tuple[str, list[_W]]] = [
|
|
49
|
+
# 1: Simple sentence — noun (de/het), verb, adjective
|
|
50
|
+
("De grote kat loopt snel.", [
|
|
51
|
+
_w("de kat", "noun"), _w("groot", "adjective"),
|
|
52
|
+
_w("lopen", "verb"), _w("snel", "adverb"),
|
|
53
|
+
]),
|
|
54
|
+
# 2: Proper nouns and prepositions
|
|
55
|
+
("Jan woont in Nederland.", [
|
|
56
|
+
_w("Jan", "proper_noun_person"), _w("wonen", "verb"),
|
|
57
|
+
_w("in", "preposition"), _w("Nederland", "proper_noun_country"),
|
|
58
|
+
]),
|
|
59
|
+
# 3: Compound expression
|
|
60
|
+
("Zij gaat er vandoor met haar vriend.", [
|
|
61
|
+
_w("zij", "pronoun"), _w("ervandoor gaan", "verb"),
|
|
62
|
+
_w("met", "preposition"), _w("haar", "pronoun"), _w("de vriend", "noun"),
|
|
63
|
+
]),
|
|
64
|
+
# 4: Non-Dutch text — empty list
|
|
65
|
+
("The quick brown fox jumps over the lazy dog.", []),
|
|
66
|
+
# 5: Mixed markdown with various word types
|
|
67
|
+
("# Welkom\n\nHet **kleine** kind speelt vrolijk in de tuin.", [
|
|
68
|
+
_w("welkom", "adjective"), _w("het kind", "noun"), _w("klein", "adjective"),
|
|
69
|
+
_w("spelen", "verb"), _w("vrolijk", "adverb"),
|
|
70
|
+
_w("in", "preposition"), _w("de tuin", "noun"),
|
|
71
|
+
]),
|
|
72
|
+
# 6: Product packaging prose — brand names, adjectives as base form, plurals singularized
|
|
73
|
+
(
|
|
74
|
+
"Met De Ruijter kunt u elke dag genieten "
|
|
75
|
+
"van een breed assortiment smakelijke producten.\n"
|
|
76
|
+
"Chocoladevlokken Melk en Puur\n"
|
|
77
|
+
"Chocoladehagel Melk en Puur\n"
|
|
78
|
+
"Vruchtenhagel\nAnijshagel\nVlokfeest\n"
|
|
79
|
+
"Gestampte Muisjes\nRose en Witte Muisjes\nBlauwe en Witte Muisjes",
|
|
80
|
+
[
|
|
81
|
+
_w("met", "preposition"), _w("De Ruijter", "proper_noun_person"),
|
|
82
|
+
_w("kunnen", "verb"), _w("u", "pronoun"), _w("elk", "adjective"),
|
|
83
|
+
_w("de dag", "noun"), _w("genieten", "verb"), _w("van", "preposition"),
|
|
84
|
+
_w("een", "article"), _w("breed", "adjective"),
|
|
85
|
+
_w("het assortiment", "noun"), _w("smakelijk", "adjective"),
|
|
86
|
+
_w("het product", "noun"), _w("de chocoladevlokken", "noun"),
|
|
87
|
+
_w("de melk", "noun"), _w("en", "conjunction"), _w("puur", "adjective"),
|
|
88
|
+
_w("de chocoladehagel", "noun"), _w("de vruchtenhagel", "noun"),
|
|
89
|
+
_w("de anijshagel", "noun"), _w("het vlokfeest", "noun"),
|
|
90
|
+
_w("gestampt", "adjective"), _w("het muisje", "noun"),
|
|
91
|
+
_w("roze", "adjective"), _w("wit", "adjective"), _w("blauw", "adjective"),
|
|
92
|
+
],
|
|
93
|
+
),
|
|
94
|
+
]
|
|
95
|
+
# fmt: on
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _make_ai(words: list[_W], call_id: str) -> AIMessage:
    """Wrap *words* in an example AIMessage carrying a _WordList tool call."""
    tool_call = {"name": TOOL_NAME, "args": {"words": words}, "id": call_id}
    return AIMessage(content="", tool_calls=[tool_call])
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _make_ack(words: list[_W], call_id: str) -> ToolMessage:
    """Build the ToolMessage acknowledging an example tool call."""
    payload = json.dumps({"words": words}, ensure_ascii=False)
    return ToolMessage(content=payload, tool_call_id=call_id)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def build_prompt() -> ChatPromptTemplate:
    """Build the Dutch word extraction prompt with few-shot examples."""
    messages: list = [SystemMessage(content=SYSTEM_INSTRUCTION)]
    # Each example expands to the human text, the tool-calling AI reply,
    # and the acknowledging tool message.
    for index, (text, words) in enumerate(EXAMPLES, 1):
        call_id = f"call_ex_{index}"
        messages.append(HumanMessage(content=text))
        messages.append(_make_ai(words, call_id))
        messages.append(_make_ack(words, call_id))
    messages.append(MessagesPlaceholder(variable_name="text"))
    return ChatPromptTemplate.from_messages(messages)


if __name__ == "__main__":
    from nl_processing.core.scripts.prompt_author import save_prompt

    save_prompt(build_prompt(), str(OUTPUT_PATH))
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import pathlib
|
|
2
|
+
|
|
3
|
+
from langchain_core.messages import HumanMessage
|
|
4
|
+
from langchain_openai import ChatOpenAI
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
from nl_processing.core.exceptions import APIError
|
|
8
|
+
from nl_processing.core.models import Language, WordEntry
|
|
9
|
+
from nl_processing.core.prompts import load_prompt
|
|
10
|
+
|
|
11
|
+
# Directory holding the serialized prompt artifacts (<language>.json),
# resolved relative to this file so lookup works from any working directory.
_PROMPTS_DIR = pathlib.Path(__file__).parent / "prompts"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class _WordList(BaseModel):
    """Internal wrapper: bind_tools needs a single model, output is a list."""

    # Extracted words populated from the model's tool-call arguments.
    words: list[WordEntry]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class WordExtractor:
    """Extract and normalize words from markdown text.

    Usage:
        extractor = WordExtractor()
        words = await extractor.extract(text)
    """

    def __init__(
        self,
        *,
        language: Language = Language.NL,
        model: str = "gpt-4.1-mini",
    ) -> None:
        """Load the language-specific prompt and bind the extraction tool.

        Args:
            language: Source language whose prompt artifact to load.
            model: OpenAI chat model name.
        """
        self._language = language
        prompt_path = str(_PROMPTS_DIR / f"{language.value}.json")
        prompt = load_prompt(prompt_path)

        # temperature=0 for deterministic extraction; tool_choice forces the
        # model to always respond via the _WordList tool call.
        llm = ChatOpenAI(model=model, temperature=0).bind_tools([_WordList], tool_choice=_WordList.__name__)
        self._chain = prompt | llm

    async def extract(self, text: str) -> list[WordEntry]:
        """Extract and normalize words from the given text.

        Returns a list of WordEntry objects with normalized forms and types.
        Returns an empty list if no words in the target language are found.

        Raises:
            APIError: If the LLM call or tool-call parsing fails.
        """
        # Consistent with TextTranslator/WordTranslator: blank input never
        # reaches the API — the prompt already defines empty output for it.
        if not text.strip():
            return []

        try:
            response = await self._chain.ainvoke({"text": [HumanMessage(content=text)]})
            result = _WordList(**response.tool_calls[0]["args"])  # type: ignore[attr-defined]
        except Exception as e:
            raise APIError(str(e)) from e

        return result.words
|
|
File without changes
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Generate the Dutch→Russian translation prompt (nl_ru.json) with few-shot examples.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
uv run python nl_processing/translate_text/prompts/generate_nl_ru_prompt.py
|
|
5
|
+
|
|
6
|
+
This script:
|
|
7
|
+
1. Defines the system instruction (in Russian) for a professional NL→RU translator
|
|
8
|
+
2. Builds few-shot examples as HumanMessage + AIMessage + ToolMessage triplets
|
|
9
|
+
3. Serializes with dumpd() and saves to nl_ru.json
|
|
10
|
+
|
|
11
|
+
The script is the source of truth — nl_ru.json is the generated artifact.
|
|
12
|
+
Re-run this script whenever examples or system instruction change.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
|
|
18
|
+
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
|
19
|
+
|
|
20
|
+
# System instruction (Russian) sent verbatim to the model — runtime text, do not edit casually.
SYSTEM_INSTRUCTION = (
    "Вы — профессиональный переводчик с нидерландского на русский язык. "
    "Переведите предоставленный текст естественно, сохраняя смысл близко к оригиналу. "
    "Сохраняйте всё форматирование markdown (заголовки, жирный, курсив, списки, разрывы абзацев). "
    "Верните только переведённый текст — без комментариев, пояснений и префиксов. "
    "Если ввод пуст или не содержит нидерландского текста, верните пустую строку."
)

# Must stay in sync with the tool schema class name in translate_text/service.py.
TOOL_NAME = "_TranslatedText"

# Example 1: simple sentence.
EXAMPLE_1_INPUT = "De zon schijnt vandaag."
EXAMPLE_1_OUTPUT = "Сегодня светит солнце."

# Example 2: markdown heading + bold preserved through translation.
EXAMPLE_2_INPUT = "# Welkom\n\nDit is een **belangrijk** bericht."
EXAMPLE_2_OUTPUT = "# Добро пожаловать\n\nЭто **важное** сообщение."

# Example 3: bullet list with italics preserved.
EXAMPLE_3_INPUT = "Wat heb je nodig:\n\n- *Melk*\n- *Brood*\n- *Kaas*"
EXAMPLE_3_OUTPUT = "Что тебе нужно:\n\n- *Молоко*\n- *Хлеб*\n- *Сыр*"

# Example 4: empty input -> empty output.
EXAMPLE_4_INPUT = ""
EXAMPLE_4_OUTPUT = ""

# Example 5: non-Dutch input -> empty output.
EXAMPLE_5_INPUT = "The quick brown fox jumps over the lazy dog."
EXAMPLE_5_OUTPUT = ""

# Generated artifact path (written next to this script).
OUTPUT_PATH = Path(__file__).parent / "nl_ru.json"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _make_example_ai(translated_text: str, call_id: str) -> AIMessage:
    """Create an AIMessage with a tool_call for _TranslatedText."""
    tool_call = {
        "name": TOOL_NAME,
        "args": {"text": translated_text},
        "id": call_id,
    }
    return AIMessage(content="", tool_calls=[tool_call])
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def build_prompt() -> ChatPromptTemplate:
    """Build the Dutch→Russian translation prompt with 5 few-shot examples.

    Each example contributes a HumanMessage (source text), an AIMessage
    carrying the tool call, and the acknowledging ToolMessage; the trailing
    MessagesPlaceholder receives the real input under the "text" variable.
    """
    # Data-driven examples replace five copy-pasted triplets (same ids/order).
    examples = [
        (EXAMPLE_1_INPUT, EXAMPLE_1_OUTPUT),  # simple sentence
        (EXAMPLE_2_INPUT, EXAMPLE_2_OUTPUT),  # markdown with headings and bold
        (EXAMPLE_3_INPUT, EXAMPLE_3_OUTPUT),  # list + italic
        (EXAMPLE_4_INPUT, EXAMPLE_4_OUTPUT),  # empty input
        (EXAMPLE_5_INPUT, EXAMPLE_5_OUTPUT),  # non-Dutch text
    ]
    messages: list = [SystemMessage(content=SYSTEM_INSTRUCTION)]
    for i, (source, translated) in enumerate(examples, start=1):
        call_id = f"call_example_{i}"
        messages += [
            HumanMessage(content=source),
            _make_example_ai(translated, call_id),
            ToolMessage(content=translated, tool_call_id=call_id),
        ]
    # Placeholder for actual input
    messages.append(MessagesPlaceholder(variable_name="text"))
    return ChatPromptTemplate.from_messages(messages)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
if __name__ == "__main__":
    # Local import: the authoring helper is only needed when regenerating.
    from nl_processing.core.scripts.prompt_author import save_prompt

    # Serialize the prompt template and write the nl_ru.json artifact.
    save_prompt(build_prompt(), str(OUTPUT_PATH))
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import pathlib
|
|
2
|
+
|
|
3
|
+
from langchain_core.messages import HumanMessage
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
from nl_processing.core.exceptions import APIError
|
|
7
|
+
from nl_processing.core.models import Language
|
|
8
|
+
from nl_processing.core.prompts import build_translation_chain
|
|
9
|
+
|
|
10
|
+
# Serialized prompt artifacts (<src>_<dst>.json) live next to this module.
_PROMPTS_DIR = pathlib.Path(__file__).parent / "prompts"

# (source, target) language-code pairs that have a prompt artifact available.
_SUPPORTED_PAIRS: set[tuple[str, str]] = {("nl", "ru")}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Tool schema bound in build_translation_chain. NOTE: the prompt generator's
# TOOL_NAME must match this class name ("_TranslatedText"); keep them in sync.
class _TranslatedText(BaseModel):
    # Full translated text (markdown preserved) filled in by the model.
    text: str
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class TextTranslator:
    """Translate text between languages with markdown preservation.

    Usage:
        translator = TextTranslator(
            source_language=Language.NL,
            target_language=Language.RU,
        )
        result = await translator.translate(dutch_text)
    """

    def __init__(self, *, source_language: Language, target_language: Language, model: str = "gpt-4.1-mini") -> None:
        self._source_language = source_language
        self._target_language = target_language
        # Chain = serialized prompt | tool-bound LLM, built by the shared helper.
        self._chain = build_translation_chain(
            source_language=source_language,
            target_language=target_language,
            supported_pairs=_SUPPORTED_PAIRS,
            prompts_dir=_PROMPTS_DIR,
            tool_schema=_TranslatedText,
            model=model,
        )

    async def translate(self, text: str) -> str:
        """Translate text from source to target language.

        Returns the translated text or empty string for empty/non-source input.
        """
        # Blank input never reaches the API.
        if not text.strip():
            return ""

        try:
            reply = await self._chain.ainvoke({"text": [HumanMessage(content=text)]})
            call_args = reply.tool_calls[0]["args"]  # type: ignore[attr-defined]
            parsed = _TranslatedText(**call_args)
        except Exception as exc:
            raise APIError(str(exc)) from exc

        return parsed.text
|
|
File without changes
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Generate the Dutch-to-Russian word translation prompt (nl_ru.json).
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
uv run python nl_processing/translate_word/prompts/generate_nl_ru_prompt.py
|
|
5
|
+
|
|
6
|
+
This script:
|
|
7
|
+
1. Builds a ChatPromptTemplate with system instruction and 4 few-shot examples
|
|
8
|
+
2. Demonstrates one-to-one order-preserving word translation
|
|
9
|
+
3. Serializes with dumpd() and saves to nl_ru.json
|
|
10
|
+
|
|
11
|
+
The script is the source of truth -- nl_ru.json is the generated artifact.
|
|
12
|
+
Re-run this script whenever prompt content changes.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
|
|
19
|
+
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
|
20
|
+
|
|
21
|
+
# System instruction (Russian) sent verbatim to the model — runtime text, do not edit casually.
SYSTEM_INSTRUCTION = (
    "Вы — профессиональный переводчик с нидерландского языка на русский. "
    "Вы получаете список нидерландских слов или фраз и должны перевести каждое на русский. "
    "Верните ровно один перевод для каждого входного слова, в том же порядке. "
    "Количество переводов в результате должно равняться количеству слов на входе. "
    "Каждый перевод должен быть наиболее распространённым, естественным русским эквивалентом. "
    "Если входной список пуст, верните пустой список."
)

# Example 1: three simple words, newline-separated (one word per line).
EXAMPLE_1_INPUT = "huis\nlopen\nsnel"
EXAMPLE_1_OUTPUT = [
    {"translation": "дом"},
    {"translation": "ходить"},
    {"translation": "быстро"},
]

# Example 2: five entries including nouns carrying their articles.
EXAMPLE_2_INPUT = "de kat\nhet boek\nschrijven\nmooi\nin"
EXAMPLE_2_OUTPUT = [
    {"translation": "кошка"},
    {"translation": "книга"},
    {"translation": "писать"},
    {"translation": "красивый"},
    {"translation": "в"},
]

# Example 3: multi-word expression translated as a single unit.
EXAMPLE_3_INPUT = "er vandoor gaan\nde fiets"
EXAMPLE_3_OUTPUT = [
    {"translation": "сбежать"},
    {"translation": "велосипед"},
]

# Example 4: empty input -> empty translations list.
EXAMPLE_4_INPUT = ""
EXAMPLE_4_OUTPUT: list[dict[str, str]] = []

# Generated artifact path (written next to this script).
OUTPUT_PATH = Path(__file__).parent / "nl_ru.json"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _make_example_ai(translations: list[dict[str, str]], call_id: str) -> AIMessage:
    """Create an AIMessage with a tool_call for _TranslationBatch."""
    call = {
        "name": "_TranslationBatch",
        "args": {"translations": translations},
        "id": call_id,
    }
    return AIMessage(content="", tool_calls=[call])
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def build_prompt() -> ChatPromptTemplate:
    """Build the Dutch-to-Russian word translation prompt with few-shot examples.

    Each example contributes a HumanMessage (newline-separated words), an
    AIMessage carrying the tool call, and the acknowledging ToolMessage;
    the trailing MessagesPlaceholder receives the real input as "text".
    """
    # Data-driven examples replace four copy-pasted triplets (same ids/order).
    examples = [
        (EXAMPLE_1_INPUT, EXAMPLE_1_OUTPUT),  # 3 simple words
        (EXAMPLE_2_INPUT, EXAMPLE_2_OUTPUT),  # 5 words with articles
        (EXAMPLE_3_INPUT, EXAMPLE_3_OUTPUT),  # compound expression
        (EXAMPLE_4_INPUT, EXAMPLE_4_OUTPUT),  # empty list
    ]
    messages: list = [SystemMessage(content=SYSTEM_INSTRUCTION)]
    for i, (words_text, translations) in enumerate(examples, start=1):
        call_id = f"call_example_{i}"
        messages += [
            HumanMessage(content=words_text),
            _make_example_ai(translations, call_id),
            ToolMessage(
                content=json.dumps({"translations": translations}, ensure_ascii=False),
                tool_call_id=call_id,
            ),
        ]
    # Placeholder for actual input
    messages.append(MessagesPlaceholder(variable_name="text"))
    return ChatPromptTemplate.from_messages(messages)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
if __name__ == "__main__":
    # Local import: the authoring helper is only needed when regenerating.
    from nl_processing.core.scripts.prompt_author import save_prompt

    # Serialize the prompt template and write the nl_ru.json artifact.
    save_prompt(build_prompt(), str(OUTPUT_PATH))
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import pathlib
|
|
2
|
+
|
|
3
|
+
from langchain_core.messages import HumanMessage
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
from nl_processing.core.exceptions import APIError
|
|
7
|
+
from nl_processing.core.models import Language, TranslationResult
|
|
8
|
+
from nl_processing.core.prompts import build_translation_chain
|
|
9
|
+
|
|
10
|
+
# Serialized prompt artifacts (<src>_<dst>.json) live next to this module.
_PROMPTS_DIR = pathlib.Path(__file__).parent / "prompts"

# (source, target) language-code pairs that have a prompt artifact available.
_SUPPORTED_PAIRS: set[tuple[str, str]] = {("nl", "ru")}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Tool schema bound in build_translation_chain. NOTE: the prompt generator
# hardcodes this class name ("_TranslationBatch") in its example tool calls;
# keep them in sync.
class _TranslationBatch(BaseModel):
    # One TranslationResult per input word, in input order.
    translations: list[TranslationResult]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class WordTranslator:
    """Translate word batches between languages.

    Usage:
        translator = WordTranslator(
            source_language=Language.NL,
            target_language=Language.RU,
        )
        results = await translator.translate(["huis", "lopen"])
    """

    def __init__(
        self,
        *,
        source_language: Language,
        target_language: Language,
        model: str = "gpt-4.1-mini",
    ) -> None:
        self._source_language = source_language
        self._target_language = target_language
        # Chain = serialized prompt | tool-bound LLM, built by the shared helper.
        self._chain = build_translation_chain(
            source_language=source_language,
            target_language=target_language,
            supported_pairs=_SUPPORTED_PAIRS,
            prompts_dir=_PROMPTS_DIR,
            tool_schema=_TranslationBatch,
            model=model,
        )

    async def translate(self, words: list[str]) -> list[TranslationResult]:
        """Translate a list of words from source to target language.

        Returns one TranslationResult per input word, in the same order.
        Returns empty list for empty input (no API call).
        """
        if not words:
            return []

        # The prompt expects one word or phrase per line.
        joined = "\n".join(words)

        try:
            reply = await self._chain.ainvoke({"text": [HumanMessage(content=joined)]})
            call_args = reply.tool_calls[0]["args"]  # type: ignore[attr-defined]
            batch = _TranslationBatch(**call_args)
        except Exception as exc:
            raise APIError(str(exc)) from exc

        return batch.translations
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nl_processing
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Natural language processing playground
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: pydantic<3,>=2.0
|
|
8
|
+
Requires-Dist: langchain<1,>=0.3
|
|
9
|
+
Requires-Dist: langchain-openai<1,>=0.3
|
|
10
|
+
Requires-Dist: opencv-python<5,>=4.10
|
|
11
|
+
|
|
12
|
+
# nl_processing
|
|
13
|
+
|
|
14
|
+
Minimal NLP processing project scaffolded from `proto_tg_bot` structure.
|
|
15
|
+
|
|
16
|
+
## Directory Structure
|
|
17
|
+
|
|
18
|
+
| Directory | Description |
|
|
19
|
+
|-------------------|-------------------------------------|
|
|
20
|
+
| `nl_processing/` | Main package source code |
|
|
21
|
+
| `tests/` | Unit, integration, and e2e tests |
|
|
22
|
+
| `scripts/` | Utility/dev scripts |
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
nl_processing/__init__.py
|
|
4
|
+
nl_processing.egg-info/PKG-INFO
|
|
5
|
+
nl_processing.egg-info/SOURCES.txt
|
|
6
|
+
nl_processing.egg-info/dependency_links.txt
|
|
7
|
+
nl_processing.egg-info/requires.txt
|
|
8
|
+
nl_processing.egg-info/top_level.txt
|
|
9
|
+
nl_processing/core/__init__.py
|
|
10
|
+
nl_processing/core/exceptions.py
|
|
11
|
+
nl_processing/core/models.py
|
|
12
|
+
nl_processing/core/prompts.py
|
|
13
|
+
nl_processing/core/scripts/prompt_author.py
|
|
14
|
+
nl_processing/database/__init__.py
|
|
15
|
+
nl_processing/database/service.py
|
|
16
|
+
nl_processing/extract_text_from_image/__init__.py
|
|
17
|
+
nl_processing/extract_text_from_image/benchmark.py
|
|
18
|
+
nl_processing/extract_text_from_image/image_encoding.py
|
|
19
|
+
nl_processing/extract_text_from_image/service.py
|
|
20
|
+
nl_processing/extract_text_from_image/prompts/generate_nl_prompt.py
|
|
21
|
+
nl_processing/extract_words_from_text/__init__.py
|
|
22
|
+
nl_processing/extract_words_from_text/service.py
|
|
23
|
+
nl_processing/extract_words_from_text/prompts/generate_nl_prompt.py
|
|
24
|
+
nl_processing/translate_text/__init__.py
|
|
25
|
+
nl_processing/translate_text/service.py
|
|
26
|
+
nl_processing/translate_text/prompts/generate_nl_ru_prompt.py
|
|
27
|
+
nl_processing/translate_word/__init__.py
|
|
28
|
+
nl_processing/translate_word/service.py
|
|
29
|
+
nl_processing/translate_word/prompts/generate_nl_ru_prompt.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
nl_processing
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "nl_processing"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Natural language processing playground"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"pydantic>=2.0,<3",
|
|
13
|
+
"langchain>=0.3,<1",
|
|
14
|
+
"langchain-openai>=0.3,<1",
|
|
15
|
+
"opencv-python>=4.10,<5",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[tool.setuptools.packages.find]
|
|
19
|
+
where = ["."]
|
|
20
|
+
include = ["nl_processing*"]
|
|
21
|
+
exclude = ["tests*"]
|
|
22
|
+
|
|
23
|
+
[dependency-groups]
|
|
24
|
+
dev = [
|
|
25
|
+
"pytest>=9.0.2,<10",
|
|
26
|
+
"pytest-asyncio>=1.3.0,<2",
|
|
27
|
+
"pytest-xdist>=3.8.0,<4",
|
|
28
|
+
"ruff>=0.15.0,<0.16",
|
|
29
|
+
"pylint>=4.0.4,<5",
|
|
30
|
+
"vulture>=2.14.0,<3",
|
|
31
|
+
]
|