hamtaa-texttools 1.1.22__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hamtaa_texttools-1.1.22/hamtaa_texttools.egg-info → hamtaa_texttools-1.2.0}/PKG-INFO +11 -25
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0/hamtaa_texttools.egg-info}/PKG-INFO +11 -25
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/hamtaa_texttools.egg-info/SOURCES.txt +13 -12
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/hamtaa_texttools.egg-info/requires.txt +1 -1
- hamtaa_texttools-1.2.0/pyproject.toml +45 -0
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/tests/test_all_async_tools.py +2 -5
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/tests/test_all_tools.py +3 -10
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/tests/test_output_validation.py +2 -6
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/texttools/__init__.py +4 -4
- hamtaa_texttools-1.2.0/texttools/batch/__init__.py +0 -0
- hamtaa_texttools-1.1.22/texttools/batch/batch_config.py → hamtaa_texttools-1.2.0/texttools/batch/config.py +3 -2
- hamtaa_texttools-1.1.22/texttools/batch/batch_manager.py → hamtaa_texttools-1.2.0/texttools/batch/manager.py +8 -8
- hamtaa_texttools-1.1.22/texttools/batch/batch_runner.py → hamtaa_texttools-1.2.0/texttools/batch/runner.py +10 -10
- hamtaa_texttools-1.2.0/texttools/core/__init__.py +0 -0
- hamtaa_texttools-1.2.0/texttools/core/engine.py +254 -0
- {hamtaa_texttools-1.1.22/texttools/internals → hamtaa_texttools-1.2.0/texttools/core}/exceptions.py +0 -6
- hamtaa_texttools-1.2.0/texttools/core/internal_models.py +58 -0
- {hamtaa_texttools-1.1.22/texttools/internals → hamtaa_texttools-1.2.0/texttools/core/operators}/async_operator.py +12 -21
- {hamtaa_texttools-1.1.22/texttools/internals → hamtaa_texttools-1.2.0/texttools/core/operators}/sync_operator.py +12 -21
- hamtaa_texttools-1.2.0/texttools/models.py +88 -0
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/texttools/prompts/categorize.yaml +3 -2
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/texttools/prompts/check_fact.yaml +5 -0
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/texttools/prompts/extract_entities.yaml +4 -0
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/texttools/prompts/extract_keywords.yaml +15 -3
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/texttools/prompts/is_question.yaml +4 -0
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/texttools/prompts/merge_questions.yaml +8 -1
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/texttools/prompts/propositionize.yaml +2 -0
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/texttools/prompts/rewrite.yaml +3 -4
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/texttools/prompts/subject_to_question.yaml +5 -1
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/texttools/prompts/summarize.yaml +4 -0
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/texttools/prompts/text_to_question.yaml +4 -0
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/texttools/prompts/translate.yaml +5 -0
- hamtaa_texttools-1.2.0/texttools/py.typed +0 -0
- hamtaa_texttools-1.2.0/texttools/tools/__init__.py +0 -0
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/texttools/tools/async_tools.py +183 -194
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/texttools/tools/sync_tools.py +183 -195
- hamtaa_texttools-1.1.22/MANIFEST.in +0 -2
- hamtaa_texttools-1.1.22/pyproject.toml +0 -34
- hamtaa_texttools-1.1.22/texttools/internals/models.py +0 -150
- hamtaa_texttools-1.1.22/texttools/internals/operator_utils.py +0 -76
- hamtaa_texttools-1.1.22/texttools/internals/prompt_loader.py +0 -91
- hamtaa_texttools-1.1.22/texttools/internals/text_to_chunks.py +0 -97
- hamtaa_texttools-1.1.22/texttools/prompts/README.md +0 -35
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/LICENSE +0 -0
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/README.md +0 -0
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/hamtaa_texttools.egg-info/dependency_links.txt +0 -0
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/hamtaa_texttools.egg-info/top_level.txt +0 -0
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/setup.cfg +0 -0
- {hamtaa_texttools-1.1.22 → hamtaa_texttools-1.2.0}/texttools/prompts/run_custom.yaml +0 -0
|
@@ -1,34 +1,20 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hamtaa-texttools
|
|
3
|
-
Version: 1.1.22
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: A high-level NLP toolkit built on top of modern LLMs.
|
|
5
|
-
Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>,
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
Copyright (c) 2025 Hamtaa
|
|
9
|
-
|
|
10
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
-
in the Software without restriction, including without limitation the rights
|
|
13
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
-
furnished to do so, subject to the following conditions:
|
|
16
|
-
|
|
17
|
-
The above copyright notice and this permission notice shall be included in all
|
|
18
|
-
copies or substantial portions of the Software.
|
|
19
|
-
|
|
20
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
-
SOFTWARE.
|
|
5
|
+
Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Erfan Moosavi <erfanmoosavi84@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, Zareshahi <a.zareshahi1377@gmail.com>
|
|
6
|
+
Maintainer-email: Erfan Moosavi <erfanmoosavi84@gmail.com>, Tohidi <the.mohammad.tohidi@gmail.com>
|
|
7
|
+
License: MIT
|
|
27
8
|
Keywords: nlp,llm,text-processing,openai
|
|
28
|
-
|
|
9
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
12
|
+
Classifier: Topic :: Text Processing
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Requires-Python: >=3.9
|
|
29
15
|
Description-Content-Type: text/markdown
|
|
30
16
|
License-File: LICENSE
|
|
31
|
-
Requires-Dist: openai
|
|
17
|
+
Requires-Dist: openai>=1.97.1
|
|
32
18
|
Requires-Dist: pydantic>=2.0.0
|
|
33
19
|
Requires-Dist: pyyaml>=6.0
|
|
34
20
|
Dynamic: license-file
|
|
@@ -1,34 +1,20 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hamtaa-texttools
|
|
3
|
-
Version: 1.1.22
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: A high-level NLP toolkit built on top of modern LLMs.
|
|
5
|
-
Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>,
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
Copyright (c) 2025 Hamtaa
|
|
9
|
-
|
|
10
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
-
in the Software without restriction, including without limitation the rights
|
|
13
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
-
furnished to do so, subject to the following conditions:
|
|
16
|
-
|
|
17
|
-
The above copyright notice and this permission notice shall be included in all
|
|
18
|
-
copies or substantial portions of the Software.
|
|
19
|
-
|
|
20
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
-
SOFTWARE.
|
|
5
|
+
Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Erfan Moosavi <erfanmoosavi84@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, Zareshahi <a.zareshahi1377@gmail.com>
|
|
6
|
+
Maintainer-email: Erfan Moosavi <erfanmoosavi84@gmail.com>, Tohidi <the.mohammad.tohidi@gmail.com>
|
|
7
|
+
License: MIT
|
|
27
8
|
Keywords: nlp,llm,text-processing,openai
|
|
28
|
-
|
|
9
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
12
|
+
Classifier: Topic :: Text Processing
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Requires-Python: >=3.9
|
|
29
15
|
Description-Content-Type: text/markdown
|
|
30
16
|
License-File: LICENSE
|
|
31
|
-
Requires-Dist: openai
|
|
17
|
+
Requires-Dist: openai>=1.97.1
|
|
32
18
|
Requires-Dist: pydantic>=2.0.0
|
|
33
19
|
Requires-Dist: pyyaml>=6.0
|
|
34
20
|
Dynamic: license-file
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
LICENSE
|
|
2
|
-
MANIFEST.in
|
|
3
2
|
README.md
|
|
4
3
|
pyproject.toml
|
|
5
4
|
hamtaa_texttools.egg-info/PKG-INFO
|
|
@@ -11,17 +10,18 @@ tests/test_all_async_tools.py
|
|
|
11
10
|
tests/test_all_tools.py
|
|
12
11
|
tests/test_output_validation.py
|
|
13
12
|
texttools/__init__.py
|
|
14
|
-
texttools/
|
|
15
|
-
texttools/
|
|
16
|
-
texttools/batch/
|
|
17
|
-
texttools/
|
|
18
|
-
texttools/
|
|
19
|
-
texttools/
|
|
20
|
-
texttools/
|
|
21
|
-
texttools/
|
|
22
|
-
texttools/
|
|
23
|
-
texttools/
|
|
24
|
-
texttools/
|
|
13
|
+
texttools/models.py
|
|
14
|
+
texttools/py.typed
|
|
15
|
+
texttools/batch/__init__.py
|
|
16
|
+
texttools/batch/config.py
|
|
17
|
+
texttools/batch/manager.py
|
|
18
|
+
texttools/batch/runner.py
|
|
19
|
+
texttools/core/__init__.py
|
|
20
|
+
texttools/core/engine.py
|
|
21
|
+
texttools/core/exceptions.py
|
|
22
|
+
texttools/core/internal_models.py
|
|
23
|
+
texttools/core/operators/async_operator.py
|
|
24
|
+
texttools/core/operators/sync_operator.py
|
|
25
25
|
texttools/prompts/categorize.yaml
|
|
26
26
|
texttools/prompts/check_fact.yaml
|
|
27
27
|
texttools/prompts/extract_entities.yaml
|
|
@@ -35,5 +35,6 @@ texttools/prompts/subject_to_question.yaml
|
|
|
35
35
|
texttools/prompts/summarize.yaml
|
|
36
36
|
texttools/prompts/text_to_question.yaml
|
|
37
37
|
texttools/prompts/translate.yaml
|
|
38
|
+
texttools/tools/__init__.py
|
|
38
39
|
texttools/tools/async_tools.py
|
|
39
40
|
texttools/tools/sync_tools.py
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "hamtaa-texttools"
|
|
7
|
+
version = "1.2.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{name = "Tohidi", email = "the.mohammad.tohidi@gmail.com"},
|
|
10
|
+
{name = "Erfan Moosavi", email = "erfanmoosavi84@gmail.com"},
|
|
11
|
+
{name = "Montazer", email = "montazerh82@gmail.com"},
|
|
12
|
+
{name = "Givechi", email = "mohamad.m.givechi@gmail.com"},
|
|
13
|
+
{name = "Zareshahi", email = "a.zareshahi1377@gmail.com"},
|
|
14
|
+
]
|
|
15
|
+
maintainers = [
|
|
16
|
+
{name = "Erfan Moosavi", email = "erfanmoosavi84@gmail.com"},
|
|
17
|
+
{name = "Tohidi", email = "the.mohammad.tohidi@gmail.com"},
|
|
18
|
+
]
|
|
19
|
+
description = "A high-level NLP toolkit built on top of modern LLMs."
|
|
20
|
+
readme = "README.md"
|
|
21
|
+
license = {text = "MIT"}
|
|
22
|
+
requires-python = ">=3.9"
|
|
23
|
+
dependencies = [
|
|
24
|
+
"openai>=1.97.1",
|
|
25
|
+
"pydantic>=2.0.0",
|
|
26
|
+
"pyyaml>=6.0",
|
|
27
|
+
]
|
|
28
|
+
keywords = ["nlp", "llm", "text-processing", "openai"]
|
|
29
|
+
classifiers = [
|
|
30
|
+
"Development Status :: 5 - Production/Stable",
|
|
31
|
+
"License :: OSI Approved :: MIT License",
|
|
32
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
33
|
+
"Topic :: Text Processing",
|
|
34
|
+
"Operating System :: OS Independent",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[tool.setuptools.packages.find]
|
|
38
|
+
where = ["."]
|
|
39
|
+
include = ["texttools*"]
|
|
40
|
+
|
|
41
|
+
[tool.setuptools]
|
|
42
|
+
include-package-data = true
|
|
43
|
+
|
|
44
|
+
[tool.setuptools.package-data]
|
|
45
|
+
"texttools" = ["prompts/*.yaml", "py.typed"]
|
|
@@ -6,16 +6,13 @@ from openai import AsyncOpenAI
|
|
|
6
6
|
|
|
7
7
|
from texttools import AsyncTheTool
|
|
8
8
|
|
|
9
|
-
# Load environment variables from .env
|
|
10
9
|
load_dotenv()
|
|
11
|
-
|
|
10
|
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
|
12
11
|
BASE_URL = os.getenv("BASE_URL")
|
|
13
12
|
MODEL = os.getenv("MODEL")
|
|
14
13
|
|
|
15
|
-
|
|
16
|
-
client = AsyncOpenAI(base_url=BASE_URL, api_key=API_KEY)
|
|
14
|
+
client = AsyncOpenAI(base_url=BASE_URL, api_key=OPENAI_API_KEY)
|
|
17
15
|
|
|
18
|
-
# Create an instance of TheTool
|
|
19
16
|
t = AsyncTheTool(client=client, model=MODEL)
|
|
20
17
|
|
|
21
18
|
|
|
@@ -6,24 +6,18 @@ from pydantic import BaseModel
|
|
|
6
6
|
|
|
7
7
|
from texttools import TheTool, CategoryTree
|
|
8
8
|
|
|
9
|
-
# Load environment variables from .env
|
|
10
9
|
load_dotenv()
|
|
11
|
-
|
|
10
|
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
|
12
11
|
BASE_URL = os.getenv("BASE_URL")
|
|
13
12
|
MODEL = os.getenv("MODEL")
|
|
14
13
|
|
|
15
|
-
|
|
16
|
-
client = OpenAI(base_url=BASE_URL, api_key=API_KEY)
|
|
14
|
+
client = OpenAI(base_url=BASE_URL, api_key=OPENAI_API_KEY)
|
|
17
15
|
|
|
18
|
-
# Create an instance of TheTool
|
|
19
16
|
t = TheTool(client=client, model=MODEL)
|
|
20
17
|
|
|
21
18
|
# Categorizer: list mode
|
|
22
19
|
category = t.categorize(
|
|
23
|
-
"سلام حالت چطوره؟",
|
|
24
|
-
categories=["هیچکدام", "دینی", "فلسفه"],
|
|
25
|
-
logprobs=True,
|
|
26
|
-
top_logprobs=3,
|
|
20
|
+
"سلام حالت چطوره؟", categories=["هیچکدام", "دینی", "فلسفه"], priority=3
|
|
27
21
|
)
|
|
28
22
|
print(repr(category))
|
|
29
23
|
|
|
@@ -56,7 +50,6 @@ entities = t.extract_entities(
|
|
|
56
50
|
"Ali will be dead by the car crash",
|
|
57
51
|
entities=["EVENT"],
|
|
58
52
|
with_analysis=True,
|
|
59
|
-
logprobs=True,
|
|
60
53
|
)
|
|
61
54
|
print(repr(entities))
|
|
62
55
|
|
|
@@ -6,16 +6,13 @@ from openai import OpenAI
|
|
|
6
6
|
|
|
7
7
|
from texttools import TheTool
|
|
8
8
|
|
|
9
|
-
# Load environment variables from .env
|
|
10
9
|
load_dotenv()
|
|
11
|
-
|
|
10
|
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
|
12
11
|
BASE_URL = os.getenv("BASE_URL")
|
|
13
12
|
MODEL = os.getenv("MODEL")
|
|
14
13
|
|
|
15
|
-
|
|
16
|
-
client = OpenAI(base_url=BASE_URL, api_key=API_KEY)
|
|
14
|
+
client = OpenAI(base_url=BASE_URL, api_key=OPENAI_API_KEY)
|
|
17
15
|
|
|
18
|
-
# Create an instance of TheTool
|
|
19
16
|
t = TheTool(client=client, model=MODEL)
|
|
20
17
|
|
|
21
18
|
|
|
@@ -24,7 +21,6 @@ def validate(result: Any) -> bool:
|
|
|
24
21
|
return "چیست؟" not in result
|
|
25
22
|
|
|
26
23
|
|
|
27
|
-
# Question from Text Generator
|
|
28
24
|
question = t.text_to_question(
|
|
29
25
|
"زندگی",
|
|
30
26
|
output_lang="Persian",
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
from .
|
|
1
|
+
from .batch.config import BatchConfig
|
|
2
|
+
from .batch.runner import BatchRunner
|
|
3
|
+
from .models import CategoryTree
|
|
2
4
|
from .tools.async_tools import AsyncTheTool
|
|
3
|
-
from .
|
|
4
|
-
from .batch.batch_runner import BatchRunner
|
|
5
|
-
from .batch.batch_config import BatchConfig
|
|
5
|
+
from .tools.sync_tools import TheTool
|
|
6
6
|
|
|
7
7
|
__all__ = ["TheTool", "AsyncTheTool", "CategoryTree", "BatchRunner", "BatchConfig"]
|
|
File without changes
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
1
|
from collections.abc import Callable
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Any
|
|
3
4
|
|
|
4
5
|
|
|
5
6
|
def export_data(data) -> list[dict[str, str]]:
|
|
@@ -10,7 +11,7 @@ def export_data(data) -> list[dict[str, str]]:
|
|
|
10
11
|
return data
|
|
11
12
|
|
|
12
13
|
|
|
13
|
-
def import_data(data) ->
|
|
14
|
+
def import_data(data) -> Any:
|
|
14
15
|
"""
|
|
15
16
|
Takes the output and adds and aggregates it to the original structure.
|
|
16
17
|
"""
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import json
|
|
2
|
+
import logging
|
|
2
3
|
import uuid
|
|
3
4
|
from pathlib import Path
|
|
4
|
-
from typing import Type, TypeVar
|
|
5
|
-
import logging
|
|
5
|
+
from typing import Any, Type, TypeVar
|
|
6
6
|
|
|
7
|
-
from pydantic import BaseModel
|
|
8
7
|
from openai import OpenAI
|
|
9
8
|
from openai.lib._pydantic import to_strict_json_schema
|
|
9
|
+
from pydantic import BaseModel
|
|
10
10
|
|
|
11
11
|
# Base Model type for output models
|
|
12
12
|
T = TypeVar("T", bound=BaseModel)
|
|
@@ -31,7 +31,7 @@ class BatchManager:
|
|
|
31
31
|
prompt_template: str,
|
|
32
32
|
state_dir: Path = Path(".batch_jobs"),
|
|
33
33
|
custom_json_schema_obj_str: dict | None = None,
|
|
34
|
-
**client_kwargs:
|
|
34
|
+
**client_kwargs: Any,
|
|
35
35
|
):
|
|
36
36
|
self._client = client
|
|
37
37
|
self._model = model
|
|
@@ -51,7 +51,7 @@ class BatchManager:
|
|
|
51
51
|
def _state_file(self, job_name: str) -> Path:
|
|
52
52
|
return self._state_dir / f"{job_name}.json"
|
|
53
53
|
|
|
54
|
-
def _load_state(self, job_name: str) -> list[dict[str,
|
|
54
|
+
def _load_state(self, job_name: str) -> list[dict[str, Any]]:
|
|
55
55
|
"""
|
|
56
56
|
Loads the state (job information) from the state file for the given job name.
|
|
57
57
|
Returns an empty list if the state file does not exist.
|
|
@@ -62,7 +62,7 @@ class BatchManager:
|
|
|
62
62
|
return json.load(f)
|
|
63
63
|
return []
|
|
64
64
|
|
|
65
|
-
def _save_state(self, job_name: str, jobs: list[dict[str,
|
|
65
|
+
def _save_state(self, job_name: str, jobs: list[dict[str, Any]]) -> None:
|
|
66
66
|
"""
|
|
67
67
|
Saves the job state to the state file for the given job name.
|
|
68
68
|
"""
|
|
@@ -77,11 +77,11 @@ class BatchManager:
|
|
|
77
77
|
if path.exists():
|
|
78
78
|
path.unlink()
|
|
79
79
|
|
|
80
|
-
def _build_task(self, text: str, idx: str) -> dict[str,
|
|
80
|
+
def _build_task(self, text: str, idx: str) -> dict[str, Any]:
|
|
81
81
|
"""
|
|
82
82
|
Builds a single task dictionary for the batch job, including the prompt, model, and response format configuration.
|
|
83
83
|
"""
|
|
84
|
-
response_format_config: dict[str,
|
|
84
|
+
response_format_config: dict[str, Any]
|
|
85
85
|
|
|
86
86
|
if self._custom_json_schema_obj_str:
|
|
87
87
|
response_format_config = {
|
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
import json
|
|
2
|
+
import logging
|
|
2
3
|
import os
|
|
3
4
|
import time
|
|
4
5
|
from pathlib import Path
|
|
5
|
-
from typing import Type, TypeVar
|
|
6
|
-
import logging
|
|
6
|
+
from typing import Any, Type, TypeVar
|
|
7
7
|
|
|
8
8
|
from dotenv import load_dotenv
|
|
9
9
|
from openai import OpenAI
|
|
10
10
|
from pydantic import BaseModel
|
|
11
11
|
|
|
12
|
-
from
|
|
13
|
-
from
|
|
14
|
-
from
|
|
15
|
-
from
|
|
12
|
+
from ..core.exceptions import TextToolsError
|
|
13
|
+
from ..core.internal_models import Str
|
|
14
|
+
from .config import BatchConfig
|
|
15
|
+
from .manager import BatchManager
|
|
16
16
|
|
|
17
17
|
# Base Model type for output models
|
|
18
18
|
T = TypeVar("T", bound=BaseModel)
|
|
@@ -38,7 +38,7 @@ class BatchRunner:
|
|
|
38
38
|
self._output_model = output_model
|
|
39
39
|
self._manager = self._init_manager()
|
|
40
40
|
self._data = self._load_data()
|
|
41
|
-
self._parts: list[list[dict[str,
|
|
41
|
+
self._parts: list[list[dict[str, Any]]] = []
|
|
42
42
|
# Map part index to job name
|
|
43
43
|
self._part_idx_to_job_name: dict[int, str] = {}
|
|
44
44
|
# Track retry attempts per part
|
|
@@ -47,7 +47,7 @@ class BatchRunner:
|
|
|
47
47
|
Path(self._config.BASE_OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
|
|
48
48
|
|
|
49
49
|
except Exception as e:
|
|
50
|
-
raise
|
|
50
|
+
raise TextToolsError(f"Batch runner initialization failed: {e}")
|
|
51
51
|
|
|
52
52
|
def _init_manager(self) -> BatchManager:
|
|
53
53
|
load_dotenv()
|
|
@@ -130,8 +130,8 @@ class BatchRunner:
|
|
|
130
130
|
|
|
131
131
|
def _save_results(
|
|
132
132
|
self,
|
|
133
|
-
output_data: list[dict[str,
|
|
134
|
-
log: list[
|
|
133
|
+
output_data: list[dict[str, Any]] | dict[str, Any],
|
|
134
|
+
log: list[Any],
|
|
135
135
|
part_idx: int,
|
|
136
136
|
):
|
|
137
137
|
part_suffix = f"_part_{part_idx + 1}" if len(self._parts) > 1 else ""
|
|
File without changes
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
import math
|
|
2
|
+
import random
|
|
3
|
+
import re
|
|
4
|
+
from functools import lru_cache
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
from .exceptions import PromptError
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class PromptLoader:
|
|
13
|
+
"""
|
|
14
|
+
Utility for loading and formatting YAML prompt templates.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
MAIN_TEMPLATE = "main_template"
|
|
18
|
+
ANALYZE_TEMPLATE = "analyze_template"
|
|
19
|
+
|
|
20
|
+
@lru_cache(maxsize=32)
|
|
21
|
+
def _load_templates(self, prompt_file: str, mode: str | None) -> dict[str, str]:
|
|
22
|
+
"""
|
|
23
|
+
Loads prompt templates from YAML file with optional mode selection.
|
|
24
|
+
"""
|
|
25
|
+
try:
|
|
26
|
+
base_dir = Path(__file__).parent.parent / Path("prompts")
|
|
27
|
+
prompt_path = base_dir / prompt_file
|
|
28
|
+
|
|
29
|
+
if not prompt_path.exists():
|
|
30
|
+
raise PromptError(f"Prompt file not found: {prompt_file}")
|
|
31
|
+
|
|
32
|
+
data = yaml.safe_load(prompt_path.read_text(encoding="utf-8"))
|
|
33
|
+
|
|
34
|
+
if self.MAIN_TEMPLATE not in data:
|
|
35
|
+
raise PromptError(f"Missing 'main_template' in {prompt_file}")
|
|
36
|
+
|
|
37
|
+
if self.ANALYZE_TEMPLATE not in data:
|
|
38
|
+
raise PromptError(f"Missing 'analyze_template' in {prompt_file}")
|
|
39
|
+
|
|
40
|
+
if mode and mode not in data.get(self.MAIN_TEMPLATE, {}):
|
|
41
|
+
raise PromptError(f"Mode '{mode}' not found in {prompt_file}")
|
|
42
|
+
|
|
43
|
+
main_template = (
|
|
44
|
+
data[self.MAIN_TEMPLATE][mode]
|
|
45
|
+
if mode and isinstance(data[self.MAIN_TEMPLATE], dict)
|
|
46
|
+
else data[self.MAIN_TEMPLATE]
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
analyze_template = (
|
|
50
|
+
data[self.ANALYZE_TEMPLATE][mode]
|
|
51
|
+
if mode and isinstance(data[self.ANALYZE_TEMPLATE], dict)
|
|
52
|
+
else data[self.ANALYZE_TEMPLATE]
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
if not main_template or not main_template.strip():
|
|
56
|
+
raise PromptError(
|
|
57
|
+
f"Empty main_template in {prompt_file}"
|
|
58
|
+
+ (f" for mode '{mode}'" if mode else "")
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
return {
|
|
62
|
+
self.MAIN_TEMPLATE: main_template,
|
|
63
|
+
self.ANALYZE_TEMPLATE: analyze_template,
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
except yaml.YAMLError as e:
|
|
67
|
+
raise PromptError(f"Invalid YAML in {prompt_file}: {e}")
|
|
68
|
+
except Exception as e:
|
|
69
|
+
raise PromptError(f"Failed to load prompt {prompt_file}: {e}")
|
|
70
|
+
|
|
71
|
+
def load(
|
|
72
|
+
self, prompt_file: str, text: str, mode: str, **extra_kwargs
|
|
73
|
+
) -> dict[str, str]:
|
|
74
|
+
try:
|
|
75
|
+
template_configs = self._load_templates(prompt_file, mode)
|
|
76
|
+
format_args = {"text": text}
|
|
77
|
+
format_args.update(extra_kwargs)
|
|
78
|
+
|
|
79
|
+
# Inject variables inside each template
|
|
80
|
+
for key in template_configs.keys():
|
|
81
|
+
template_configs[key] = template_configs[key].format(**format_args)
|
|
82
|
+
|
|
83
|
+
return template_configs
|
|
84
|
+
|
|
85
|
+
except KeyError as e:
|
|
86
|
+
raise PromptError(f"Missing template variable: {e}")
|
|
87
|
+
except Exception as e:
|
|
88
|
+
raise PromptError(f"Failed to format prompt: {e}")
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class OperatorUtils:
|
|
92
|
+
@staticmethod
|
|
93
|
+
def build_main_prompt(
|
|
94
|
+
main_template: str,
|
|
95
|
+
analysis: str | None,
|
|
96
|
+
output_lang: str | None,
|
|
97
|
+
user_prompt: str | None,
|
|
98
|
+
) -> str:
|
|
99
|
+
main_prompt = ""
|
|
100
|
+
|
|
101
|
+
if analysis:
|
|
102
|
+
main_prompt += f"Based on this analysis:\n{analysis}\n"
|
|
103
|
+
|
|
104
|
+
if output_lang:
|
|
105
|
+
main_prompt += f"Respond only in the {output_lang} language.\n"
|
|
106
|
+
|
|
107
|
+
if user_prompt:
|
|
108
|
+
main_prompt += f"Consider this instruction {user_prompt}\n"
|
|
109
|
+
|
|
110
|
+
main_prompt += main_template
|
|
111
|
+
|
|
112
|
+
return main_prompt
|
|
113
|
+
|
|
114
|
+
@staticmethod
|
|
115
|
+
def build_message(prompt: str) -> list[dict[str, str]]:
|
|
116
|
+
return [{"role": "user", "content": prompt}]
|
|
117
|
+
|
|
118
|
+
@staticmethod
|
|
119
|
+
def extract_logprobs(completion: dict) -> list[dict]:
|
|
120
|
+
"""
|
|
121
|
+
Extracts and filters token probabilities from completion logprobs.
|
|
122
|
+
Skips punctuation and structural tokens, returns cleaned probability data.
|
|
123
|
+
"""
|
|
124
|
+
logprobs_data = []
|
|
125
|
+
|
|
126
|
+
ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
|
|
127
|
+
|
|
128
|
+
for choice in completion.choices:
|
|
129
|
+
if not getattr(choice, "logprobs", None):
|
|
130
|
+
raise ValueError("Your model does not support logprobs")
|
|
131
|
+
|
|
132
|
+
for logprob_item in choice.logprobs.content:
|
|
133
|
+
if ignore_pattern.match(logprob_item.token):
|
|
134
|
+
continue
|
|
135
|
+
token_entry = {
|
|
136
|
+
"token": logprob_item.token,
|
|
137
|
+
"prob": round(math.exp(logprob_item.logprob), 8),
|
|
138
|
+
"top_alternatives": [],
|
|
139
|
+
}
|
|
140
|
+
for alt in logprob_item.top_logprobs:
|
|
141
|
+
if ignore_pattern.match(alt.token):
|
|
142
|
+
continue
|
|
143
|
+
token_entry["top_alternatives"].append(
|
|
144
|
+
{
|
|
145
|
+
"token": alt.token,
|
|
146
|
+
"prob": round(math.exp(alt.logprob), 8),
|
|
147
|
+
}
|
|
148
|
+
)
|
|
149
|
+
logprobs_data.append(token_entry)
|
|
150
|
+
|
|
151
|
+
return logprobs_data
|
|
152
|
+
|
|
153
|
+
@staticmethod
|
|
154
|
+
def get_retry_temp(base_temp: float) -> float:
|
|
155
|
+
delta_temp = random.choice([-1, 1]) * random.uniform(0.1, 0.9)
|
|
156
|
+
new_temp = base_temp + delta_temp
|
|
157
|
+
|
|
158
|
+
return max(0.0, min(new_temp, 1.5))
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def text_to_chunks(text: str, size: int, overlap: int) -> list[str]:
|
|
162
|
+
separators = ["\n\n", "\n", " ", ""]
|
|
163
|
+
is_separator_regex = False
|
|
164
|
+
keep_separator = True # Equivalent to 'start'
|
|
165
|
+
length_function = len
|
|
166
|
+
strip_whitespace = True
|
|
167
|
+
chunk_size = size
|
|
168
|
+
chunk_overlap = overlap
|
|
169
|
+
|
|
170
|
+
def _split_text_with_regex(
|
|
171
|
+
text: str, separator: str, keep_separator: bool
|
|
172
|
+
) -> list[str]:
|
|
173
|
+
if not separator:
|
|
174
|
+
return [text]
|
|
175
|
+
if not keep_separator:
|
|
176
|
+
return re.split(separator, text)
|
|
177
|
+
_splits = re.split(f"({separator})", text)
|
|
178
|
+
splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
|
|
179
|
+
if len(_splits) % 2 == 0:
|
|
180
|
+
splits += [_splits[-1]]
|
|
181
|
+
return [_splits[0]] + splits if _splits[0] else splits
|
|
182
|
+
|
|
183
|
+
def _join_docs(docs: list[str], separator: str) -> str | None:
|
|
184
|
+
text = separator.join(docs)
|
|
185
|
+
if strip_whitespace:
|
|
186
|
+
text = text.strip()
|
|
187
|
+
return text if text else None
|
|
188
|
+
|
|
189
|
+
def _merge_splits(splits: list[str], separator: str) -> list[str]:
|
|
190
|
+
separator_len = length_function(separator)
|
|
191
|
+
docs = []
|
|
192
|
+
current_doc = []
|
|
193
|
+
total = 0
|
|
194
|
+
for d in splits:
|
|
195
|
+
len_ = length_function(d)
|
|
196
|
+
if total + len_ + (separator_len if current_doc else 0) > chunk_size:
|
|
197
|
+
if total > chunk_size:
|
|
198
|
+
pass
|
|
199
|
+
if current_doc:
|
|
200
|
+
doc = _join_docs(current_doc, separator)
|
|
201
|
+
if doc is not None:
|
|
202
|
+
docs.append(doc)
|
|
203
|
+
while total > chunk_overlap or (
|
|
204
|
+
total + len_ + (separator_len if current_doc else 0)
|
|
205
|
+
> chunk_size
|
|
206
|
+
and total > 0
|
|
207
|
+
):
|
|
208
|
+
total -= length_function(current_doc[0]) + (
|
|
209
|
+
separator_len if len(current_doc) > 1 else 0
|
|
210
|
+
)
|
|
211
|
+
current_doc = current_doc[1:]
|
|
212
|
+
current_doc.append(d)
|
|
213
|
+
total += len_ + (separator_len if len(current_doc) > 1 else 0)
|
|
214
|
+
doc = _join_docs(current_doc, separator)
|
|
215
|
+
if doc is not None:
|
|
216
|
+
docs.append(doc)
|
|
217
|
+
return docs
|
|
218
|
+
|
|
219
|
+
def _split_text(text: str, separators: list[str]) -> list[str]:
|
|
220
|
+
final_chunks = []
|
|
221
|
+
separator = separators[-1]
|
|
222
|
+
new_separators = []
|
|
223
|
+
for i, _s in enumerate(separators):
|
|
224
|
+
separator_ = _s if is_separator_regex else re.escape(_s)
|
|
225
|
+
if not _s:
|
|
226
|
+
separator = _s
|
|
227
|
+
break
|
|
228
|
+
if re.search(separator_, text):
|
|
229
|
+
separator = _s
|
|
230
|
+
new_separators = separators[i + 1 :]
|
|
231
|
+
break
|
|
232
|
+
separator_ = separator if is_separator_regex else re.escape(separator)
|
|
233
|
+
splits = _split_text_with_regex(text, separator_, keep_separator)
|
|
234
|
+
_separator = "" if keep_separator else separator
|
|
235
|
+
good_splits = []
|
|
236
|
+
for s in splits:
|
|
237
|
+
if length_function(s) < chunk_size:
|
|
238
|
+
good_splits.append(s)
|
|
239
|
+
else:
|
|
240
|
+
if good_splits:
|
|
241
|
+
merged_text = _merge_splits(good_splits, _separator)
|
|
242
|
+
final_chunks.extend(merged_text)
|
|
243
|
+
good_splits = []
|
|
244
|
+
if not new_separators:
|
|
245
|
+
final_chunks.append(s)
|
|
246
|
+
else:
|
|
247
|
+
other_info = _split_text(s, new_separators)
|
|
248
|
+
final_chunks.extend(other_info)
|
|
249
|
+
if good_splits:
|
|
250
|
+
merged_text = _merge_splits(good_splits, _separator)
|
|
251
|
+
final_chunks.extend(merged_text)
|
|
252
|
+
return final_chunks
|
|
253
|
+
|
|
254
|
+
return _split_text(text, separators)
|