nl-processing 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. nl_processing-0.1.0/PKG-INFO +22 -0
  2. nl_processing-0.1.0/README.md +11 -0
  3. nl_processing-0.1.0/nl_processing/__init__.py +0 -0
  4. nl_processing-0.1.0/nl_processing/core/__init__.py +0 -0
  5. nl_processing-0.1.0/nl_processing/core/exceptions.py +10 -0
  6. nl_processing-0.1.0/nl_processing/core/models.py +21 -0
  7. nl_processing-0.1.0/nl_processing/core/prompts.py +98 -0
  8. nl_processing-0.1.0/nl_processing/core/scripts/prompt_author.py +59 -0
  9. nl_processing-0.1.0/nl_processing/database/__init__.py +0 -0
  10. nl_processing-0.1.0/nl_processing/database/service.py +2 -0
  11. nl_processing-0.1.0/nl_processing/extract_text_from_image/__init__.py +1 -0
  12. nl_processing-0.1.0/nl_processing/extract_text_from_image/benchmark.py +57 -0
  13. nl_processing-0.1.0/nl_processing/extract_text_from_image/image_encoding.py +64 -0
  14. nl_processing-0.1.0/nl_processing/extract_text_from_image/prompts/generate_nl_prompt.py +159 -0
  15. nl_processing-0.1.0/nl_processing/extract_text_from_image/service.py +83 -0
  16. nl_processing-0.1.0/nl_processing/extract_words_from_text/__init__.py +0 -0
  17. nl_processing-0.1.0/nl_processing/extract_words_from_text/prompts/generate_nl_prompt.py +122 -0
  18. nl_processing-0.1.0/nl_processing/extract_words_from_text/service.py +54 -0
  19. nl_processing-0.1.0/nl_processing/translate_text/__init__.py +0 -0
  20. nl_processing-0.1.0/nl_processing/translate_text/prompts/generate_nl_ru_prompt.py +88 -0
  21. nl_processing-0.1.0/nl_processing/translate_text/service.py +58 -0
  22. nl_processing-0.1.0/nl_processing/translate_word/__init__.py +0 -0
  23. nl_processing-0.1.0/nl_processing/translate_word/prompts/generate_nl_ru_prompt.py +112 -0
  24. nl_processing-0.1.0/nl_processing/translate_word/service.py +67 -0
  25. nl_processing-0.1.0/nl_processing.egg-info/PKG-INFO +22 -0
  26. nl_processing-0.1.0/nl_processing.egg-info/SOURCES.txt +29 -0
  27. nl_processing-0.1.0/nl_processing.egg-info/dependency_links.txt +1 -0
  28. nl_processing-0.1.0/nl_processing.egg-info/requires.txt +4 -0
  29. nl_processing-0.1.0/nl_processing.egg-info/top_level.txt +1 -0
  30. nl_processing-0.1.0/pyproject.toml +31 -0
  31. nl_processing-0.1.0/setup.cfg +4 -0
@@ -0,0 +1,22 @@
1
+ Metadata-Version: 2.4
2
+ Name: nl_processing
3
+ Version: 0.1.0
4
+ Summary: Natural language processing playground
5
+ Requires-Python: >=3.12
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: pydantic<3,>=2.0
8
+ Requires-Dist: langchain<1,>=0.3
9
+ Requires-Dist: langchain-openai<1,>=0.3
10
+ Requires-Dist: opencv-python<5,>=4.10
11
+
12
+ # nl_processing
13
+
14
+ Minimal NLP processing project scaffolded from `proto_tg_bot` structure.
15
+
16
+ ## Directory Structure
17
+
18
+ | Directory | Description |
19
+ |-------------------|-------------------------------------|
20
+ | `nl_processing/` | Main package source code |
21
+ | `tests/` | Unit, integration, and e2e tests |
22
+ | `scripts/` | Utility/dev scripts |
@@ -0,0 +1,11 @@
1
+ # nl_processing
2
+
3
+ Minimal NLP processing project scaffolded from `proto_tg_bot` structure.
4
+
5
+ ## Directory Structure
6
+
7
+ | Directory | Description |
8
+ |-------------------|-------------------------------------|
9
+ | `nl_processing/` | Main package source code |
10
+ | `tests/` | Unit, integration, and e2e tests |
11
+ | `scripts/` | Utility/dev scripts |
File without changes
File without changes
@@ -0,0 +1,10 @@
1
class APIError(Exception):
    """Wraps upstream OpenAI/LangChain API failures.

    Service classes raise this when any exception escapes a chain invocation;
    the original exception is chained via ``raise ... from e``.
    """


class TargetLanguageNotFoundError(Exception):
    """Raised when no text in the target language is detected."""


class UnsupportedImageFormatError(Exception):
    """Raised when the image format is not supported by the OpenAI API."""
@@ -0,0 +1,21 @@
1
+ from enum import Enum
2
+
3
+ from pydantic import BaseModel
4
+
5
+
6
class Language(Enum):
    """Languages handled by the processing services (values are language codes)."""

    NL = "nl"
    RU = "ru"


# NOTE: the BaseModel classes below are bound as OpenAI tool schemas via
# bind_tools(); they carry no docstrings because a class docstring would
# become part of the tool description sent to the API.


class ExtractedText(BaseModel):
    # Markdown-formatted text extracted from an image; empty when no text in
    # the target language was found.
    text: str


class WordEntry(BaseModel):
    # Normalized dictionary form, e.g. "de kat" (noun with article) or
    # "lopen" (verb infinitive) — see the few-shot examples in the prompt scripts.
    normalized_form: str
    # Flat part-of-speech label, e.g. "noun", "verb", "proper_noun_person".
    word_type: str


class TranslationResult(BaseModel):
    # Translated text in the target language.
    translation: str
@@ -0,0 +1,98 @@
1
+ import json
2
+ import pathlib
3
+
4
+ from langchain_core.load import load
5
+ from langchain_core.prompts import ChatPromptTemplate
6
+ from langchain_core.runnables import RunnableSerializable
7
+ from langchain_openai import ChatOpenAI
8
+ from pydantic import BaseModel
9
+
10
+ from nl_processing.core.models import Language
11
+
12
+
13
def build_translation_chain(
    *,
    source_language: Language,
    target_language: Language,
    supported_pairs: set[tuple[str, str]],
    prompts_dir: pathlib.Path,
    tool_schema: type[BaseModel],
    model: str,
) -> RunnableSerializable:  # type: ignore[type-arg]
    """Validate a language pair, load its prompt, and return a prompt|llm chain.

    This is shared infrastructure for translation-style services that follow the
    pattern: validate pair → load JSON prompt → bind_tools → compose chain.

    Args:
        source_language: Source language enum value.
        target_language: Target language enum value.
        supported_pairs: Set of (src, tgt) value strings that are allowed.
        prompts_dir: Directory containing ``<src>_<tgt>.json`` prompt files.
        tool_schema: Pydantic model class to bind as a tool.
        model: OpenAI model identifier string.

    Returns:
        A ``prompt | llm`` RunnableSerializable ready for ``ainvoke()``.

    Raises:
        ValueError: If the language pair is not in *supported_pairs*.
    """
    pair = (source_language.value, target_language.value)
    if pair not in supported_pairs:
        # Sort the pairs so the error message is deterministic; interpolating
        # the raw set directly makes the message vary between runs.
        msg = (
            f"Unsupported language pair: "
            f"{source_language.value} -> {target_language.value}. "
            f"Supported pairs: {sorted(supported_pairs)}"
        )
        raise ValueError(msg)

    prompt_file = f"{source_language.value}_{target_language.value}.json"
    prompt = load_prompt(str(prompts_dir / prompt_file))

    # temperature=0 keeps output stable; tool_choice forces the model to
    # always answer through the bound schema.
    llm = ChatOpenAI(model=model, temperature=0).bind_tools(
        [tool_schema],
        tool_choice=tool_schema.__name__,
    )
    return prompt | llm
58
+
59
+
60
def load_prompt(prompt_path: str) -> ChatPromptTemplate:
    """Deserialize a ChatPromptTemplate from a LangChain-native JSON file.

    The file must hold the output of ``langchain_core.load.dumpd(prompt)``.

    Args:
        prompt_path: Path to the serialized prompt JSON.

    Returns:
        The reconstructed ChatPromptTemplate, ready for chain composition.

    Raises:
        FileNotFoundError: If *prompt_path* does not exist.
        ValueError: On malformed JSON or failed deserialization.
        TypeError: If the JSON root is not an object, or the deserialized
            value is not a ChatPromptTemplate.
    """
    path = pathlib.Path(prompt_path)
    if not path.exists():
        raise FileNotFoundError(f"Prompt file not found: {prompt_path}")

    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON in prompt file {prompt_path}: {e}") from e

    if not isinstance(data, dict):
        raise TypeError(f"Prompt file must contain a JSON object, got {type(data).__name__}")

    try:
        prompt = load(data)
    except Exception as e:
        raise ValueError(f"Failed to deserialize ChatPromptTemplate from {prompt_path}: {e}") from e

    if not isinstance(prompt, ChatPromptTemplate):
        raise TypeError(f"Expected ChatPromptTemplate, got {type(prompt).__name__} from {prompt_path}")

    return prompt
@@ -0,0 +1,59 @@
1
+ """Prompt authoring helper — serialize ChatPromptTemplate to JSON.
2
+
3
+ Usage:
4
+ 1. Edit the ``build_prompt()`` function below to define your prompt.
5
+ 2. Set OUTPUT_PATH to your desired output file path.
6
+ 3. Run: uv run python nl_processing/core/scripts/prompt_author.py
7
+
8
+ The output JSON can be loaded by ``nl_processing.core.prompts.load_prompt()``.
9
+ """
10
+
11
+ import json
12
+
13
+ from langchain_core.load import dumpd
14
+ from langchain_core.prompts import ChatPromptTemplate
15
+
16
+
17
def build_prompt() -> ChatPromptTemplate:
    """Define your prompt here. Edit this function for each prompt you author."""
    messages = [
        ("system", "You are a helpful assistant. Respond in {language}."),
        ("human", "{input}"),
    ]
    return ChatPromptTemplate.from_messages(messages)
23
+
24
+
25
def serialize_prompt_to_json(prompt: ChatPromptTemplate, output_path: str) -> None:
    """Write *prompt* to *output_path* in LangChain's native JSON format.

    Args:
        prompt: The ChatPromptTemplate to serialize.
        output_path: Destination file path.
    """
    # Trailing newline keeps the generated artifact POSIX-friendly.
    payload = json.dumps(dumpd(prompt), indent=2) + "\n"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(payload)
36
+
37
+
38
def save_prompt(prompt: ChatPromptTemplate, output_path: str) -> None:
    """Serialize *prompt* to JSON and report a short summary on stdout.

    Intended for use in ``if __name__ == "__main__"`` blocks of prompt
    generation scripts so each script stays DRY.

    Args:
        prompt: The ChatPromptTemplate to serialize.
        output_path: Destination file path.
    """
    serialize_prompt_to_json(prompt, output_path)
    summary = (
        f"Prompt saved to {output_path}",
        f"Messages: {len(prompt.messages)}",
        f"Input variables: {prompt.input_variables}",
    )
    for line in summary:
        print(line)  # noqa: T201
52
+
53
+
54
# Default output location; adjust alongside build_prompt() when authoring.
OUTPUT_PATH = "output_prompt.json"


if __name__ == "__main__":
    _prompt = build_prompt()
    save_prompt(_prompt, OUTPUT_PATH)
File without changes
@@ -0,0 +1,2 @@
1
def save_translation(storage: dict[str, str], source_text: str, translated_text: str) -> None:
    """Store a translation in the in-memory *storage* mapping.

    The source text is the key; any existing entry for the same source text
    is overwritten.
    """
    storage[source_text] = translated_text
@@ -0,0 +1,57 @@
1
+ import re
2
+
3
+ import cv2
4
+ import numpy
5
+
6
+
7
def generate_test_image(
    text: str,
    output_path: str,
    *,
    width: int = 800,
    height: int = 200,
    font_scale: float = 1.0,
    thickness: int = 2,
) -> str:
    """Render *text* in black onto a white canvas and save it to *output_path*.

    Each newline-separated line is drawn with OpenCV's Hershey Simplex font,
    stepping down by ``40 * font_scale`` pixels per line.

    Returns the output file path.

    Raises:
        ValueError: If OpenCV fails to write the image file.
    """
    canvas = numpy.full((height, width, 3), 255, dtype=numpy.uint8)  # white background

    line_step = int(40 * font_scale)
    for row, line in enumerate(text.split("\n")):
        cv2.putText(
            canvas,
            line,
            (20, 40 + row * line_step),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_scale,
            (0, 0, 0),
            thickness,
        )

    if not cv2.imwrite(output_path, canvas):
        raise ValueError(f"Failed to write image to {output_path}")
    return output_path
43
+
44
+
45
def normalize_text(text: str) -> str:
    """Canonicalize *text* for comparison.

    Strips markdown punctuation, collapses every whitespace run (including
    line breaks) to a single space, then trims and lowercases.
    """
    without_markup = re.sub(r"[#*_~`>\-]+", "", text)
    collapsed = re.sub(r"\s+", " ", without_markup)
    return collapsed.strip().lower()


def evaluate_extraction(extracted: str, ground_truth: str) -> bool:
    """Return True when *extracted* matches *ground_truth* after normalization."""
    return normalize_text(extracted) == normalize_text(ground_truth)
@@ -0,0 +1,64 @@
1
+ import base64
2
+ import pathlib
3
+
4
+ import cv2
5
+ import numpy
6
+
7
+ from nl_processing.core.exceptions import UnsupportedImageFormatError
8
+
9
+ SUPPORTED_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp"}
10
+
11
+
12
def get_image_format(path: str) -> str:
    """Return the lowercased file extension (with leading dot) of *path*."""
    suffix = pathlib.Path(path).suffix
    return suffix.lower()
15
+
16
+
17
def validate_image_format(path: str) -> None:
    """Ensure *path* has an extension the OpenAI Vision API accepts.

    Raises:
        UnsupportedImageFormatError: If the extension is not in SUPPORTED_EXTENSIONS.
    """
    suffix = get_image_format(path)
    if suffix in SUPPORTED_EXTENSIONS:
        return
    msg = f"Unsupported image format '{suffix}'. Supported formats: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
    raise UnsupportedImageFormatError(msg)
27
+
28
+
29
def encode_path_to_base64(path: str) -> tuple[str, str]:
    """Load the image at *path* and return (base64_string, media_type).

    Format validation is intentionally left to the caller.
    """
    media_type = _suffix_to_media_type(get_image_format(path))
    raw = pathlib.Path(path).read_bytes()
    encoded = base64.b64encode(raw).decode("utf-8")
    return encoded, media_type
40
+
41
+
42
def encode_cv2_to_base64(image: numpy.ndarray) -> tuple[str, str]:
    """PNG-encode an OpenCV image array and return (base64_string, media_type)."""
    ok, buffer = cv2.imencode(".png", image)
    if not ok:
        msg = "Failed to encode image to PNG"
        raise ValueError(msg)
    encoded = base64.b64encode(buffer.tobytes()).decode("utf-8")
    return encoded, "image/png"
53
+
54
+
55
+ def _suffix_to_media_type(suffix: str) -> str:
56
+ """Convert file extension to MIME media type."""
57
+ mapping = {
58
+ ".png": "image/png",
59
+ ".jpg": "image/jpeg",
60
+ ".jpeg": "image/jpeg",
61
+ ".gif": "image/gif",
62
+ ".webp": "image/webp",
63
+ }
64
+ return mapping[suffix]
@@ -0,0 +1,159 @@
1
+ """Generate the Dutch extraction prompt (nl.json) with few-shot examples.
2
+
3
+ Usage:
4
+ uv run python nl_processing/extract_text_from_image/prompts/generate_nl_prompt.py
5
+
6
+ This script:
7
+ 1. Generates synthetic test images and encodes real photos
8
+ 2. Encodes them to base64
9
+ 3. Builds a ChatPromptTemplate with 5 few-shot examples (HumanMessage + AIMessage + ToolMessage triplets)
10
+ 4. Serializes with dumpd() and saves to nl.json
11
+
12
+ The script is the source of truth — nl.json is the generated artifact.
13
+ Re-run this script whenever example text or image parameters change.
14
+ """
15
+
16
+ from pathlib import Path
17
+ import tempfile
18
+
19
+ from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
20
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
21
+
22
+ from nl_processing.extract_text_from_image.benchmark import generate_test_image
23
+ from nl_processing.extract_text_from_image.image_encoding import encode_path_to_base64
24
+
25
+ SYSTEM_INSTRUCTION = (
26
+ "Je bent een tekst-extractie assistent. "
27
+ "Extraheer alleen de Nederlandse tekst uit het aangeboden beeld. "
28
+ "Behoud de originele documentstructuur als markdown "
29
+ "(koppen, nadruk, regelafbrekingen). "
30
+ "Negeer tekst in andere talen. "
31
+ "Retourneer alleen de geëxtraheerde tekst, zonder commentaar of uitleg."
32
+ )
33
+
34
+ EXAMPLE_1_TEXT = "De kat zit op de mat"
35
+ EXAMPLE_1_EXPECTED = "De kat zit op de mat"
36
+
37
+ EXAMPLE_2_TEXT = "Welkom bij ons\nДобро пожаловать"
38
+ EXAMPLE_2_EXPECTED = "Welkom bij ons"
39
+
40
+ EXAMPLE_3_IMAGE = Path(__file__).parent / "examples" / "dutch_handwritten_mixed.jpg"
41
+ EXAMPLE_3_EXPECTED = (
42
+ "getal, het\n"
43
+ "getrouwd\n"
44
+ "niet\n"
45
+ "nieuw\n"
46
+ "mooi\n"
47
+ "hoog\n"
48
+ "baan\n"
49
+ "kunst\n"
50
+ "heel\n"
51
+ "leren kennen\n"
52
+ "eeuw\n"
53
+ "moe\n"
54
+ "vroeg\n"
55
+ "ver\n"
56
+ "daar\n"
57
+ "tijd\n"
58
+ "lezen"
59
+ )
60
+
61
+ EXAMPLE_4_IMAGE = Path(__file__).parent / "examples" / "dutch_vocabulary_wide.jpg"
62
+ EXAMPLE_4_EXPECTED = (
63
+ "vandaan\n"
64
+ "veranderen\n"
65
+ "verbeteren\n"
66
+ "vlakbij\n"
67
+ "volgorde, de\n"
68
+ "voorbeeld, het\n"
69
+ "voornaam, de\n"
70
+ "vorm, de\n"
71
+ "vraag, de\n"
72
+ "vriendin, de\n"
73
+ "vrouw, de\n"
74
+ "wat\n"
75
+ "week, de\n"
76
+ "welkom\n"
77
+ "werken\n"
78
+ "wonen\n"
79
+ "woonplaats, de\n"
80
+ "woord, het\n"
81
+ "ze\n"
82
+ "zeggen\n"
83
+ "zij\n"
84
+ "zijn\n"
85
+ "zijn\n"
86
+ "zin, de"
87
+ )
88
+
89
+ EXAMPLE_5_TEXT = "The quick brown fox jumps over the lazy dog"
90
+ EXAMPLE_5_EXPECTED = ""
91
+
92
+ OUTPUT_PATH = Path(__file__).parent / "nl.json"
93
+
94
+
95
def _generate_image_b64(text: str, *, width: int = 800, height: int = 200) -> str:
    """Render *text* into a temporary PNG and return it as a base64 data URL."""
    with tempfile.TemporaryDirectory() as tmpdir:
        target = str(Path(tmpdir) / "image.png")
        generate_test_image(text, target, width=width, height=height, font_scale=1.2)
        encoded, media_type = encode_path_to_base64(target)
    return f"data:{media_type};base64,{encoded}"
102
+
103
+
104
def _encode_existing_image_b64(path: Path) -> str:
    """Return the existing image at *path* as a base64 data URL."""
    encoded, media_type = encode_path_to_base64(str(path))
    return f"data:{media_type};base64,{encoded}"
108
+
109
+
110
def _make_example_human(image_data_url: str) -> HumanMessage:
    """Wrap an image data URL in a single-block HumanMessage."""
    image_block = {"type": "image_url", "image_url": {"url": image_data_url}}
    return HumanMessage(content=[image_block])
117
+
118
+
119
def _make_example_ai(expected_text: str, call_id: str) -> AIMessage:
    """Assistant few-shot turn: empty content plus an ExtractedText tool call."""
    call = {"name": "ExtractedText", "args": {"text": expected_text}, "id": call_id}
    return AIMessage(content="", tool_calls=[call])
125
+
126
+
127
def build_prompt() -> ChatPromptTemplate:
    """Build the Dutch extraction prompt with 5 few-shot examples."""
    # (image data URL, expected extraction) pairs; order matters for call ids.
    examples = [
        (_generate_image_b64(EXAMPLE_1_TEXT), EXAMPLE_1_EXPECTED),
        (_generate_image_b64(EXAMPLE_2_TEXT), EXAMPLE_2_EXPECTED),
        (_encode_existing_image_b64(EXAMPLE_3_IMAGE), EXAMPLE_3_EXPECTED),
        (_encode_existing_image_b64(EXAMPLE_4_IMAGE), EXAMPLE_4_EXPECTED),
        (_generate_image_b64(EXAMPLE_5_TEXT), EXAMPLE_5_EXPECTED),
    ]

    messages: list = [SystemMessage(content=SYSTEM_INSTRUCTION)]
    for index, (data_url, expected) in enumerate(examples, 1):
        call_id = f"call_example_{index}"
        messages.append(_make_example_human(data_url))
        messages.append(_make_example_ai(expected, call_id))
        messages.append(ToolMessage(content=expected, tool_call_id=call_id))
    messages.append(MessagesPlaceholder(variable_name="images"))
    return ChatPromptTemplate.from_messages(messages)
154
+
155
+
156
if __name__ == "__main__":
    # Imported lazily so the shared authoring helper is only loaded when the
    # prompt is regenerated, not whenever this module is imported.
    from nl_processing.core.scripts.prompt_author import save_prompt

    save_prompt(build_prompt(), str(OUTPUT_PATH))
@@ -0,0 +1,83 @@
1
+ import pathlib
2
+
3
+ from langchain_core.messages import HumanMessage
4
+ from langchain_openai import ChatOpenAI
5
+ import numpy
6
+
7
+ from nl_processing.core.exceptions import APIError, TargetLanguageNotFoundError
8
+ from nl_processing.core.models import ExtractedText, Language
9
+ from nl_processing.core.prompts import load_prompt
10
+ from nl_processing.extract_text_from_image.image_encoding import (
11
+ encode_cv2_to_base64,
12
+ encode_path_to_base64,
13
+ validate_image_format,
14
+ )
15
+
16
+ # Resolve prompts directory relative to this file
17
+ _PROMPTS_DIR = pathlib.Path(__file__).parent / "prompts"
18
+
19
+
20
class ImageTextExtractor:
    """Extract language-specific text from images using OpenAI Vision API.

    Usage:
        extractor = ImageTextExtractor()
        text = await extractor.extract_from_path("image.png")
        text = await extractor.extract_from_cv2(cv2_image)
    """

    def __init__(
        self,
        *,
        language: Language = Language.NL,
        model: str = "gpt-4.1-mini",
        reasoning_effort: str | None = None,
        service_tier: str | None = None,
        temperature: float | None = 0,
    ) -> None:
        self._language = language
        # One serialized prompt per language, e.g. prompts/nl.json.
        prompt = load_prompt(str(_PROMPTS_DIR / f"{language.value}.json"))
        # tool_choice forces the model to respond via the ExtractedText schema.
        llm = ChatOpenAI(
            model=model, service_tier=service_tier, reasoning_effort=reasoning_effort, temperature=temperature
        ).bind_tools([ExtractedText], tool_choice=ExtractedText.__name__)
        self._chain = prompt | llm

    async def extract_from_path(self, path: str) -> str:
        """Extract text from the image file at *path*.

        Returns markdown-formatted text in the target language.
        """
        validate_image_format(path)
        encoded, media_type = encode_path_to_base64(path)
        return await self._aextract(encoded, media_type)

    async def extract_from_cv2(self, image: "numpy.ndarray") -> str:
        """Extract text from an in-memory OpenCV image array.

        Returns markdown-formatted text in the target language.
        """
        encoded, media_type = encode_cv2_to_base64(image)
        return await self._aextract(encoded, media_type)

    async def _aextract(self, encoded: str, media_type: str) -> str:
        """Run the chain on a base64-encoded image and unpack the tool call."""
        message = HumanMessage(
            content=[
                {"type": "image_url", "image_url": {"url": f"data:{media_type};base64,{encoded}"}},
            ]
        )
        try:
            response = await self._chain.ainvoke({"images": [message]})
            result = ExtractedText(**response.tool_calls[0]["args"])  # type: ignore[attr-defined]
        except Exception as e:
            raise APIError(str(e)) from e

        # An empty extraction means no target-language text was present.
        if not result.text.strip():
            msg = "No text in the target language was found in the image"
            raise TargetLanguageNotFoundError(msg)

        return result.text
@@ -0,0 +1,122 @@
1
+ """Generate the Dutch word extraction prompt (nl.json) with few-shot examples.
2
+
3
+ Usage:
4
+ uv run python nl_processing/extract_words_from_text/prompts/generate_nl_prompt.py
5
+
6
+ The script is the source of truth -- nl.json is the generated artifact.
7
+ Re-run this script whenever prompt content changes.
8
+ """
9
+
10
+ import json
11
+ from pathlib import Path
12
+
13
+ from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
14
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
15
+
16
+ SYSTEM_INSTRUCTION = (
17
+ "Je bent een woord-extractie assistent voor de Nederlandse taal. "
18
+ "Je taak is om alle Nederlandse woorden uit de aangeboden tekst te extraheren en te normaliseren.\n\n"
19
+ "Regels:\n"
20
+ "- Negeer alle markdown-opmaak (koppen, vet, cursief, lijsten) -- extraheer alleen taalkundige inhoud.\n"
21
+ "- Negeer tekst in andere talen dan Nederlands.\n"
22
+ "- Normaliseer elk woord volgens de Nederlandse regels:\n"
23
+ " - Zelfstandige naamwoorden: met lidwoord (de/het), bijv. 'de fiets', 'het huis'\n"
24
+ " - Werkwoorden: infinitief, bijv. 'lopen', 'hebben'\n"
25
+ " - Bijvoeglijke naamwoorden: basisvorm, bijv. 'groot', 'klein'\n"
26
+ " - Voorzetsels, voegwoorden, bijwoorden: basisvorm\n"
27
+ " - Eigennamen (personen/merken): ongewijzigd, type 'proper_noun_person'\n"
28
+ " - Eigennamen (landen): ongewijzigd, type 'proper_noun_country'\n"
29
+ "- Extraheer samengestelde uitdrukkingen en fraseologische constructies als enkele eenheden.\n"
30
+ "- Wijs een plat woordtype toe aan elk woord. Mogelijke types: "
31
+ "noun, verb, adjective, adverb, preposition, conjunction, pronoun, article, numeral, "
32
+ "proper_noun_person, proper_noun_country.\n"
33
+ "- Retourneer het resultaat als een lijst van WordEntry objecten in een _WordList wrapper.\n"
34
+ "- Als de tekst geen Nederlandse woorden bevat, retourneer dan een lege lijst."
35
+ )
36
+
37
+ TOOL_NAME = "_WordList"
38
+ OUTPUT_PATH = Path(__file__).parent / "nl.json"
39
+
40
+ _W = dict[str, str]
41
+
42
+
43
+ def _w(form: str, wtype: str) -> _W:
44
+ return {"normalized_form": form, "word_type": wtype}
45
+
46
+
47
+ # fmt: off
48
+ EXAMPLES: list[tuple[str, list[_W]]] = [
49
+ # 1: Simple sentence — noun (de/het), verb, adjective
50
+ ("De grote kat loopt snel.", [
51
+ _w("de kat", "noun"), _w("groot", "adjective"),
52
+ _w("lopen", "verb"), _w("snel", "adverb"),
53
+ ]),
54
+ # 2: Proper nouns and prepositions
55
+ ("Jan woont in Nederland.", [
56
+ _w("Jan", "proper_noun_person"), _w("wonen", "verb"),
57
+ _w("in", "preposition"), _w("Nederland", "proper_noun_country"),
58
+ ]),
59
+ # 3: Compound expression
60
+ ("Zij gaat er vandoor met haar vriend.", [
61
+ _w("zij", "pronoun"), _w("ervandoor gaan", "verb"),
62
+ _w("met", "preposition"), _w("haar", "pronoun"), _w("de vriend", "noun"),
63
+ ]),
64
+ # 4: Non-Dutch text — empty list
65
+ ("The quick brown fox jumps over the lazy dog.", []),
66
+ # 5: Mixed markdown with various word types
67
+ ("# Welkom\n\nHet **kleine** kind speelt vrolijk in de tuin.", [
68
+ _w("welkom", "adjective"), _w("het kind", "noun"), _w("klein", "adjective"),
69
+ _w("spelen", "verb"), _w("vrolijk", "adverb"),
70
+ _w("in", "preposition"), _w("de tuin", "noun"),
71
+ ]),
72
+ # 6: Product packaging prose — brand names, adjectives as base form, plurals singularized
73
+ (
74
+ "Met De Ruijter kunt u elke dag genieten "
75
+ "van een breed assortiment smakelijke producten.\n"
76
+ "Chocoladevlokken Melk en Puur\n"
77
+ "Chocoladehagel Melk en Puur\n"
78
+ "Vruchtenhagel\nAnijshagel\nVlokfeest\n"
79
+ "Gestampte Muisjes\nRose en Witte Muisjes\nBlauwe en Witte Muisjes",
80
+ [
81
+ _w("met", "preposition"), _w("De Ruijter", "proper_noun_person"),
82
+ _w("kunnen", "verb"), _w("u", "pronoun"), _w("elk", "adjective"),
83
+ _w("de dag", "noun"), _w("genieten", "verb"), _w("van", "preposition"),
84
+ _w("een", "article"), _w("breed", "adjective"),
85
+ _w("het assortiment", "noun"), _w("smakelijk", "adjective"),
86
+ _w("het product", "noun"), _w("de chocoladevlokken", "noun"),
87
+ _w("de melk", "noun"), _w("en", "conjunction"), _w("puur", "adjective"),
88
+ _w("de chocoladehagel", "noun"), _w("de vruchtenhagel", "noun"),
89
+ _w("de anijshagel", "noun"), _w("het vlokfeest", "noun"),
90
+ _w("gestampt", "adjective"), _w("het muisje", "noun"),
91
+ _w("roze", "adjective"), _w("wit", "adjective"), _w("blauw", "adjective"),
92
+ ],
93
+ ),
94
+ ]
95
+ # fmt: on
96
+
97
+
98
def _make_ai(words: list[_W], call_id: str) -> AIMessage:
    """Assistant few-shot turn: empty content plus a _WordList tool call."""
    call = {"name": TOOL_NAME, "args": {"words": words}, "id": call_id}
    return AIMessage(content="", tool_calls=[call])
103
+
104
+
105
def _make_ack(words: list[_W], call_id: str) -> ToolMessage:
    """Tool acknowledgement echoing the word list back as JSON."""
    payload = json.dumps({"words": words}, ensure_ascii=False)
    return ToolMessage(content=payload, tool_call_id=call_id)
107
+
108
+
109
def build_prompt() -> ChatPromptTemplate:
    """Build the Dutch word extraction prompt with few-shot examples."""
    messages: list = [SystemMessage(content=SYSTEM_INSTRUCTION)]
    for index, (text, words) in enumerate(EXAMPLES, 1):
        call_id = f"call_ex_{index}"
        messages.append(HumanMessage(content=text))
        messages.append(_make_ai(words, call_id))
        messages.append(_make_ack(words, call_id))
    messages.append(MessagesPlaceholder(variable_name="text"))
    return ChatPromptTemplate.from_messages(messages)
117
+
118
+
119
if __name__ == "__main__":
    # Imported lazily so the shared authoring helper is only loaded when the
    # prompt is regenerated, not whenever this module is imported.
    from nl_processing.core.scripts.prompt_author import save_prompt

    save_prompt(build_prompt(), str(OUTPUT_PATH))
@@ -0,0 +1,54 @@
1
+ import pathlib
2
+
3
+ from langchain_core.messages import HumanMessage
4
+ from langchain_openai import ChatOpenAI
5
+ from pydantic import BaseModel
6
+
7
+ from nl_processing.core.exceptions import APIError
8
+ from nl_processing.core.models import Language, WordEntry
9
+ from nl_processing.core.prompts import load_prompt
10
+
11
+ # Resolve prompts directory relative to this file
12
+ _PROMPTS_DIR = pathlib.Path(__file__).parent / "prompts"
13
+
14
+
15
class _WordList(BaseModel):
    """Internal wrapper: bind_tools needs a single model, output is a list."""

    # NOTE: the docstring above doubles as the tool description sent to OpenAI.
    words: list[WordEntry]
19
+
20
+
21
class WordExtractor:
    """Extract and normalize words from markdown text.

    Usage:
        extractor = WordExtractor()
        words = await extractor.extract(text)
    """

    def __init__(
        self,
        *,
        language: Language = Language.NL,
        model: str = "gpt-4.1-mini",
    ) -> None:
        self._language = language
        # One serialized prompt per language, e.g. prompts/nl.json.
        prompt = load_prompt(str(_PROMPTS_DIR / f"{language.value}.json"))
        # tool_choice forces a _WordList-shaped answer; temperature=0 keeps it stable.
        llm = ChatOpenAI(model=model, temperature=0).bind_tools([_WordList], tool_choice=_WordList.__name__)
        self._chain = prompt | llm

    async def extract(self, text: str) -> list[WordEntry]:
        """Extract and normalize words from *text*.

        Returns WordEntry objects with normalized forms and flat word types;
        an empty list when the text contains no target-language words.

        Raises:
            APIError: If the chain invocation or tool-call parsing fails.
        """
        message = HumanMessage(content=text)
        try:
            response = await self._chain.ainvoke({"text": [message]})
            parsed = _WordList(**response.tool_calls[0]["args"])  # type: ignore[attr-defined]
        except Exception as e:
            raise APIError(str(e)) from e

        return parsed.words
@@ -0,0 +1,88 @@
1
+ """Generate the Dutch→Russian translation prompt (nl_ru.json) with few-shot examples.
2
+
3
+ Usage:
4
+ uv run python nl_processing/translate_text/prompts/generate_nl_ru_prompt.py
5
+
6
+ This script:
7
+ 1. Defines the system instruction (in Russian) for a professional NL→RU translator
8
+ 2. Builds few-shot examples as HumanMessage + AIMessage + ToolMessage triplets
9
+ 3. Serializes with dumpd() and saves to nl_ru.json
10
+
11
+ The script is the source of truth — nl_ru.json is the generated artifact.
12
+ Re-run this script whenever examples or system instruction change.
13
+ """
14
+
15
+ from pathlib import Path
16
+
17
+ from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
18
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
19
+
20
+ SYSTEM_INSTRUCTION = (
21
+ "Вы — профессиональный переводчик с нидерландского на русский язык. "
22
+ "Переведите предоставленный текст естественно, сохраняя смысл близко к оригиналу. "
23
+ "Сохраняйте всё форматирование markdown (заголовки, жирный, курсив, списки, разрывы абзацев). "
24
+ "Верните только переведённый текст — без комментариев, пояснений и префиксов. "
25
+ "Если ввод пуст или не содержит нидерландского текста, верните пустую строку."
26
+ )
27
+
28
+ TOOL_NAME = "_TranslatedText"
29
+
30
+ EXAMPLE_1_INPUT = "De zon schijnt vandaag."
31
+ EXAMPLE_1_OUTPUT = "Сегодня светит солнце."
32
+
33
+ EXAMPLE_2_INPUT = "# Welkom\n\nDit is een **belangrijk** bericht."
34
+ EXAMPLE_2_OUTPUT = "# Добро пожаловать\n\nЭто **важное** сообщение."
35
+
36
+ EXAMPLE_3_INPUT = "Wat heb je nodig:\n\n- *Melk*\n- *Brood*\n- *Kaas*"
37
+ EXAMPLE_3_OUTPUT = "Что тебе нужно:\n\n- *Молоко*\n- *Хлеб*\n- *Сыр*"
38
+
39
+ EXAMPLE_4_INPUT = ""
40
+ EXAMPLE_4_OUTPUT = ""
41
+
42
+ EXAMPLE_5_INPUT = "The quick brown fox jumps over the lazy dog."
43
+ EXAMPLE_5_OUTPUT = ""
44
+
45
+ OUTPUT_PATH = Path(__file__).parent / "nl_ru.json"
46
+
47
+
48
def _make_example_ai(translated_text: str, call_id: str) -> AIMessage:
    """Assistant few-shot turn: empty content plus a _TranslatedText tool call."""
    call = {"name": TOOL_NAME, "args": {"text": translated_text}, "id": call_id}
    return AIMessage(content="", tool_calls=[call])
54
+
55
+
56
def build_prompt() -> ChatPromptTemplate:
    """Build the Dutch→Russian translation prompt with 5 few-shot examples."""
    # (input, expected output) pairs covering: plain prose, markdown headings
    # and bold, lists with italics, empty input, and non-Dutch input.
    examples = [
        (EXAMPLE_1_INPUT, EXAMPLE_1_OUTPUT),
        (EXAMPLE_2_INPUT, EXAMPLE_2_OUTPUT),
        (EXAMPLE_3_INPUT, EXAMPLE_3_OUTPUT),
        (EXAMPLE_4_INPUT, EXAMPLE_4_OUTPUT),
        (EXAMPLE_5_INPUT, EXAMPLE_5_OUTPUT),
    ]

    messages: list = [SystemMessage(content=SYSTEM_INSTRUCTION)]
    for index, (source, translated) in enumerate(examples, 1):
        call_id = f"call_example_{index}"
        messages.append(HumanMessage(content=source))
        messages.append(_make_example_ai(translated, call_id))
        messages.append(ToolMessage(content=translated, tool_call_id=call_id))
    # Placeholder for the actual input at invocation time.
    messages.append(MessagesPlaceholder(variable_name="text"))
    return ChatPromptTemplate.from_messages(messages)
83
+
84
+
85
if __name__ == "__main__":
    # Imported lazily so the shared authoring helper is only loaded when the
    # prompt is regenerated, not whenever this module is imported.
    from nl_processing.core.scripts.prompt_author import save_prompt

    save_prompt(build_prompt(), str(OUTPUT_PATH))
@@ -0,0 +1,58 @@
1
+ import pathlib
2
+
3
+ from langchain_core.messages import HumanMessage
4
+ from pydantic import BaseModel
5
+
6
+ from nl_processing.core.exceptions import APIError
7
+ from nl_processing.core.models import Language
8
+ from nl_processing.core.prompts import build_translation_chain
9
+
10
# Directory holding the serialized prompt artifacts (e.g. nl_ru.json).
_PROMPTS_DIR = pathlib.Path(__file__).parent / "prompts"

# Language pairs this service ships prompt files for.
# NOTE(review): presumably build_translation_chain validates the requested
# pair against this set — confirm in nl_processing.core.prompts.
_SUPPORTED_PAIRS: set[tuple[str, str]] = {("nl", "ru")}
13
+
14
+
15
class _TranslatedText(BaseModel):
    """Tool-call schema: the model returns the translation in one field."""

    # Translated text in the target language.
    text: str
17
+
18
+
19
class TextTranslator:
    """Translate text between languages with markdown preservation.

    Usage:
        translator = TextTranslator(
            source_language=Language.NL,
            target_language=Language.RU,
        )
        result = await translator.translate(dutch_text)
    """

    def __init__(self, *, source_language: Language, target_language: Language, model: str = "gpt-4.1-mini") -> None:
        self._source_language = source_language
        self._target_language = target_language
        # The chain encapsulates prompt loading, pair validation, and the
        # tool-bound model; we only invoke it and parse the tool call.
        self._chain = build_translation_chain(
            source_language=source_language,
            target_language=target_language,
            supported_pairs=_SUPPORTED_PAIRS,
            prompts_dir=_PROMPTS_DIR,
            tool_schema=_TranslatedText,
            model=model,
        )

    async def translate(self, text: str) -> str:
        """Translate *text* from the source to the target language.

        Returns the translated text, or "" for blank input (no API call is
        made). Raises APIError when the chain invocation or the tool-call
        parsing fails for any reason.
        """
        if not text.strip():
            return ""

        payload = {"text": [HumanMessage(content=text)]}
        try:
            response = await self._chain.ainvoke(payload)
            first_call = response.tool_calls[0]  # type: ignore[attr-defined]
            parsed = _TranslatedText(**first_call["args"])
        except Exception as exc:
            # Any failure (network, model, malformed tool call) surfaces as
            # the package-level APIError, preserving the original cause.
            raise APIError(str(exc)) from exc

        return parsed.text
@@ -0,0 +1,112 @@
1
+ """Generate the Dutch-to-Russian word translation prompt (nl_ru.json).
2
+
3
+ Usage:
4
+ uv run python nl_processing/translate_word/prompts/generate_nl_ru_prompt.py
5
+
6
+ This script:
7
+ 1. Builds a ChatPromptTemplate with system instruction and 4 few-shot examples
8
+ 2. Demonstrates one-to-one order-preserving word translation
9
+ 3. Serializes with dumpd() and saves to nl_ru.json
10
+
11
+ The script is the source of truth -- nl_ru.json is the generated artifact.
12
+ Re-run this script whenever prompt content changes.
13
+ """
14
+
15
+ import json
16
+ from pathlib import Path
17
+
18
+ from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
19
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
20
+
21
# System instruction (Russian): act as a professional Dutch→Russian
# translator; return exactly one translation per input word, in order,
# and an empty list for empty input.
SYSTEM_INSTRUCTION = (
    "Вы — профессиональный переводчик с нидерландского языка на русский. "
    "Вы получаете список нидерландских слов или фраз и должны перевести каждое на русский. "
    "Верните ровно один перевод для каждого входного слова, в том же порядке. "
    "Количество переводов в результате должно равняться количеству слов на входе. "
    "Каждый перевод должен быть наиболее распространённым, естественным русским эквивалентом. "
    "Если входной список пуст, верните пустой список."
)

# Example 1: three simple standalone words, one per line.
EXAMPLE_1_INPUT = "huis\nlopen\nsnel"
EXAMPLE_1_OUTPUT = [
    {"translation": "дом"},
    {"translation": "ходить"},
    {"translation": "быстро"},
]

# Example 2: nouns with their articles ("de"/"het"), a verb, an adjective,
# and a preposition — articles are dropped in the translations.
EXAMPLE_2_INPUT = "de kat\nhet boek\nschrijven\nmooi\nin"
EXAMPLE_2_OUTPUT = [
    {"translation": "кошка"},
    {"translation": "книга"},
    {"translation": "писать"},
    {"translation": "красивый"},
    {"translation": "в"},
]

# Example 3: a multi-word expression translated as a single unit.
EXAMPLE_3_INPUT = "er vandoor gaan\nde fiets"
EXAMPLE_3_OUTPUT = [
    {"translation": "сбежать"},
    {"translation": "велосипед"},
]

# Example 4: empty input must yield an empty translation list.
EXAMPLE_4_INPUT = ""
EXAMPLE_4_OUTPUT: list[dict[str, str]] = []

# Generated artifact path: the serialized prompt is written next to this script.
OUTPUT_PATH = Path(__file__).parent / "nl_ru.json"
56
+
57
+
58
def _make_example_ai(translations: list[dict[str, str]], call_id: str) -> AIMessage:
    """Build a few-shot AIMessage carrying a _TranslationBatch tool call.

    The tool-call name and args shape mirror the _TranslationBatch schema
    the service binds to the model.
    """
    call = {
        "name": "_TranslationBatch",
        "args": {"translations": translations},
        "id": call_id,
    }
    return AIMessage(content="", tool_calls=[call])
70
+
71
+
72
def build_prompt() -> ChatPromptTemplate:
    """Build the Dutch-to-Russian word translation prompt with few-shot examples.

    Each example is a Human/AI/Tool message triple demonstrating the
    expected _TranslationBatch tool call; a MessagesPlaceholder at the end
    receives the actual input at invocation time.
    """
    examples = [
        (EXAMPLE_1_INPUT, EXAMPLE_1_OUTPUT, "call_example_1"),  # 3 simple words
        (EXAMPLE_2_INPUT, EXAMPLE_2_OUTPUT, "call_example_2"),  # 5 words with articles
        (EXAMPLE_3_INPUT, EXAMPLE_3_OUTPUT, "call_example_3"),  # compound expression
        (EXAMPLE_4_INPUT, EXAMPLE_4_OUTPUT, "call_example_4"),  # empty list
    ]
    messages: list = [SystemMessage(content=SYSTEM_INSTRUCTION)]
    for source_text, outputs, call_id in examples:
        messages.append(HumanMessage(content=source_text))
        messages.append(_make_example_ai(outputs, call_id))
        messages.append(
            ToolMessage(
                content=json.dumps({"translations": outputs}, ensure_ascii=False),
                tool_call_id=call_id,
            )
        )
    messages.append(MessagesPlaceholder(variable_name="text"))
    return ChatPromptTemplate.from_messages(messages)
107
+
108
+
109
if __name__ == "__main__":
    # Local import keeps the authoring helper out of the module's import-time
    # dependencies; running this script (re)generates the nl_ru.json artifact.
    from nl_processing.core.scripts.prompt_author import save_prompt

    save_prompt(build_prompt(), str(OUTPUT_PATH))
@@ -0,0 +1,67 @@
1
+ import pathlib
2
+
3
+ from langchain_core.messages import HumanMessage
4
+ from pydantic import BaseModel
5
+
6
+ from nl_processing.core.exceptions import APIError
7
+ from nl_processing.core.models import Language, TranslationResult
8
+ from nl_processing.core.prompts import build_translation_chain
9
+
10
# Directory holding the serialized prompt artifacts (e.g. nl_ru.json).
_PROMPTS_DIR = pathlib.Path(__file__).parent / "prompts"

# Language pairs this service ships prompt files for.
# NOTE(review): presumably build_translation_chain validates the requested
# pair against this set — confirm in nl_processing.core.prompts.
_SUPPORTED_PAIRS: set[tuple[str, str]] = {("nl", "ru")}
13
+
14
+
15
class _TranslationBatch(BaseModel):
    """Tool-call schema: the model returns all word translations at once."""

    # One TranslationResult per input word, in the same order as the input.
    translations: list[TranslationResult]
17
+
18
+
19
class WordTranslator:
    """Translate word batches between languages.

    Usage:
        translator = WordTranslator(
            source_language=Language.NL,
            target_language=Language.RU,
        )
        results = await translator.translate(["huis", "lopen"])
    """

    def __init__(
        self,
        *,
        source_language: Language,
        target_language: Language,
        model: str = "gpt-4.1-mini",
    ) -> None:
        self._source_language = source_language
        self._target_language = target_language
        # The chain encapsulates prompt loading, pair validation, and the
        # tool-bound model; we only invoke it and parse the tool call.
        self._chain = build_translation_chain(
            source_language=source_language,
            target_language=target_language,
            supported_pairs=_SUPPORTED_PAIRS,
            prompts_dir=_PROMPTS_DIR,
            tool_schema=_TranslationBatch,
            model=model,
        )

    async def translate(self, words: list[str]) -> list[TranslationResult]:
        """Translate a list of words from source to target language.

        Returns one TranslationResult per input word, in the same order.
        Returns an empty list for empty input without calling the API.
        Raises APIError when the chain invocation or the tool-call parsing
        fails for any reason.
        """
        if not words:
            return []

        # One word/phrase per line — matches the few-shot prompt format.
        message = HumanMessage(content="\n".join(words))
        try:
            response = await self._chain.ainvoke({"text": [message]})
            first_call = response.tool_calls[0]  # type: ignore[attr-defined]
            batch = _TranslationBatch(**first_call["args"])
        except Exception as exc:
            # Any failure (network, model, malformed tool call) surfaces as
            # the package-level APIError, preserving the original cause.
            raise APIError(str(exc)) from exc

        return batch.translations
@@ -0,0 +1,22 @@
1
+ Metadata-Version: 2.4
2
+ Name: nl_processing
3
+ Version: 0.1.0
4
+ Summary: Natural language processing playground
5
+ Requires-Python: >=3.12
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: pydantic<3,>=2.0
8
+ Requires-Dist: langchain<1,>=0.3
9
+ Requires-Dist: langchain-openai<1,>=0.3
10
+ Requires-Dist: opencv-python<5,>=4.10
11
+
12
+ # nl_processing
13
+
14
+ Minimal NLP processing project scaffolded from `proto_tg_bot` structure.
15
+
16
+ ## Directory Structure
17
+
18
+ | Directory | Description |
19
+ |-------------------|-------------------------------------|
20
+ | `nl_processing/` | Main package source code |
21
+ | `tests/` | Unit, integration, and e2e tests |
22
+ | `scripts/` | Utility/dev scripts |
@@ -0,0 +1,29 @@
1
+ README.md
2
+ pyproject.toml
3
+ nl_processing/__init__.py
4
+ nl_processing.egg-info/PKG-INFO
5
+ nl_processing.egg-info/SOURCES.txt
6
+ nl_processing.egg-info/dependency_links.txt
7
+ nl_processing.egg-info/requires.txt
8
+ nl_processing.egg-info/top_level.txt
9
+ nl_processing/core/__init__.py
10
+ nl_processing/core/exceptions.py
11
+ nl_processing/core/models.py
12
+ nl_processing/core/prompts.py
13
+ nl_processing/core/scripts/prompt_author.py
14
+ nl_processing/database/__init__.py
15
+ nl_processing/database/service.py
16
+ nl_processing/extract_text_from_image/__init__.py
17
+ nl_processing/extract_text_from_image/benchmark.py
18
+ nl_processing/extract_text_from_image/image_encoding.py
19
+ nl_processing/extract_text_from_image/service.py
20
+ nl_processing/extract_text_from_image/prompts/generate_nl_prompt.py
21
+ nl_processing/extract_words_from_text/__init__.py
22
+ nl_processing/extract_words_from_text/service.py
23
+ nl_processing/extract_words_from_text/prompts/generate_nl_prompt.py
24
+ nl_processing/translate_text/__init__.py
25
+ nl_processing/translate_text/service.py
26
+ nl_processing/translate_text/prompts/generate_nl_ru_prompt.py
27
+ nl_processing/translate_word/__init__.py
28
+ nl_processing/translate_word/service.py
29
+ nl_processing/translate_word/prompts/generate_nl_ru_prompt.py
@@ -0,0 +1,4 @@
1
+ pydantic<3,>=2.0
2
+ langchain<1,>=0.3
3
+ langchain-openai<1,>=0.3
4
+ opencv-python<5,>=4.10
@@ -0,0 +1 @@
1
+ nl_processing
@@ -0,0 +1,31 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "nl_processing"
7
+ version = "0.1.0"
8
+ description = "Natural language processing playground"
9
+ readme = "README.md"
10
+ requires-python = ">=3.12"
11
+ dependencies = [
12
+ "pydantic>=2.0,<3",
13
+ "langchain>=0.3,<1",
14
+ "langchain-openai>=0.3,<1",
15
+ "opencv-python>=4.10,<5",
16
+ ]
17
+
18
+ [tool.setuptools.packages.find]
19
+ where = ["."]
20
+ include = ["nl_processing*"]
21
+ exclude = ["tests*"]
22
+
23
+ [dependency-groups]
24
+ dev = [
25
+ "pytest>=9.0.2,<10",
26
+ "pytest-asyncio>=1.3.0,<2",
27
+ "pytest-xdist>=3.8.0,<4",
28
+ "ruff>=0.15.0,<0.16",
29
+ "pylint>=4.0.4,<5",
30
+ "vulture>=2.14.0,<3",
31
+ ]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+