hamtaa-texttools 0.1.43__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hamtaa-texttools might be problematic.
- hamtaa_texttools-0.1.43/PKG-INFO +60 -0
- hamtaa_texttools-0.1.43/README.md +50 -0
- hamtaa_texttools-0.1.43/hamtaa_texttools.egg-info/PKG-INFO +60 -0
- hamtaa_texttools-0.1.43/hamtaa_texttools.egg-info/SOURCES.txt +65 -0
- hamtaa_texttools-0.1.43/hamtaa_texttools.egg-info/dependency_links.txt +1 -0
- hamtaa_texttools-0.1.43/hamtaa_texttools.egg-info/requires.txt +2 -0
- hamtaa_texttools-0.1.43/hamtaa_texttools.egg-info/top_level.txt +1 -0
- hamtaa_texttools-0.1.43/pyproject.toml +24 -0
- hamtaa_texttools-0.1.43/setup.cfg +4 -0
- hamtaa_texttools-0.1.43/setup.py +4 -0
- hamtaa_texttools-0.1.43/tests/test_vllm_output.py +79 -0
- hamtaa_texttools-0.1.43/texttools/__init__.py +26 -0
- hamtaa_texttools-0.1.43/texttools/base/__init__.py +3 -0
- hamtaa_texttools-0.1.43/texttools/base/base_categorizer.py +40 -0
- hamtaa_texttools-0.1.43/texttools/base/base_keyword_extractor.py +35 -0
- hamtaa_texttools-0.1.43/texttools/base/base_ner_extractor.py +61 -0
- hamtaa_texttools-0.1.43/texttools/base/base_question_detector.py +35 -0
- hamtaa_texttools-0.1.43/texttools/base/base_question_generator.py +99 -0
- hamtaa_texttools-0.1.43/texttools/base/base_question_merger.py +59 -0
- hamtaa_texttools-0.1.43/texttools/base/base_question_rewriter.py +61 -0
- hamtaa_texttools-0.1.43/texttools/base/base_router.py +33 -0
- hamtaa_texttools-0.1.43/texttools/base/base_summarizer.py +55 -0
- hamtaa_texttools-0.1.43/texttools/base/base_task_performer.py +53 -0
- hamtaa_texttools-0.1.43/texttools/base/base_translator.py +38 -0
- hamtaa_texttools-0.1.43/texttools/batch_manager/__init__.py +2 -0
- hamtaa_texttools-0.1.43/texttools/batch_manager/batch_manager.py +241 -0
- hamtaa_texttools-0.1.43/texttools/batch_manager/batch_runner.py +207 -0
- hamtaa_texttools-0.1.43/texttools/formatter/__init__.py +1 -0
- hamtaa_texttools-0.1.43/texttools/formatter/base.py +26 -0
- hamtaa_texttools-0.1.43/texttools/formatter/gemma3_formatter.py +51 -0
- hamtaa_texttools-0.1.43/texttools/handlers/__init__.py +6 -0
- hamtaa_texttools-0.1.43/texttools/handlers/categorizer/__init__.py +6 -0
- hamtaa_texttools-0.1.43/texttools/handlers/categorizer/categorizer.py +61 -0
- hamtaa_texttools-0.1.43/texttools/handlers/handlers.py +88 -0
- hamtaa_texttools-0.1.43/texttools/tools/__init__.py +33 -0
- hamtaa_texttools-0.1.43/texttools/tools/categorizer/__init__.py +2 -0
- hamtaa_texttools-0.1.43/texttools/tools/categorizer/encoder_model/__init__.py +1 -0
- hamtaa_texttools-0.1.43/texttools/tools/categorizer/encoder_model/encoder_vectorizer.py +51 -0
- hamtaa_texttools-0.1.43/texttools/tools/categorizer/llm/__init__.py +2 -0
- hamtaa_texttools-0.1.43/texttools/tools/categorizer/llm/gemma_categorizer.py +169 -0
- hamtaa_texttools-0.1.43/texttools/tools/categorizer/llm/openai_categorizer.py +80 -0
- hamtaa_texttools-0.1.43/texttools/tools/keyword_extractor/__init__.py +1 -0
- hamtaa_texttools-0.1.43/texttools/tools/keyword_extractor/gemma_extractor.py +138 -0
- hamtaa_texttools-0.1.43/texttools/tools/merger/__init__.py +2 -0
- hamtaa_texttools-0.1.43/texttools/tools/merger/gemma_question_merger.py +214 -0
- hamtaa_texttools-0.1.43/texttools/tools/ner/__init__.py +1 -0
- hamtaa_texttools-0.1.43/texttools/tools/ner/gemma_ner_extractor.py +157 -0
- hamtaa_texttools-0.1.43/texttools/tools/question_detector/__init__.py +2 -0
- hamtaa_texttools-0.1.43/texttools/tools/question_detector/gemma_detector.py +130 -0
- hamtaa_texttools-0.1.43/texttools/tools/question_detector/llm_detector.py +112 -0
- hamtaa_texttools-0.1.43/texttools/tools/question_generator/__init__.py +1 -0
- hamtaa_texttools-0.1.43/texttools/tools/question_generator/gemma_question_generator.py +198 -0
- hamtaa_texttools-0.1.43/texttools/tools/reranker/__init__.py +3 -0
- hamtaa_texttools-0.1.43/texttools/tools/reranker/reranker.py +137 -0
- hamtaa_texttools-0.1.43/texttools/tools/reranker/scorer.py +216 -0
- hamtaa_texttools-0.1.43/texttools/tools/reranker/sorter.py +278 -0
- hamtaa_texttools-0.1.43/texttools/tools/rewriter/__init__.py +2 -0
- hamtaa_texttools-0.1.43/texttools/tools/rewriter/gemma_question_rewriter.py +213 -0
- hamtaa_texttools-0.1.43/texttools/tools/router/__init__.py +0 -0
- hamtaa_texttools-0.1.43/texttools/tools/router/gemma_router.py +169 -0
- hamtaa_texttools-0.1.43/texttools/tools/subject_to_question/__init__.py +1 -0
- hamtaa_texttools-0.1.43/texttools/tools/subject_to_question/gemma_question_generator.py +224 -0
- hamtaa_texttools-0.1.43/texttools/tools/summarizer/__init__.py +2 -0
- hamtaa_texttools-0.1.43/texttools/tools/summarizer/gemma_summarizer.py +140 -0
- hamtaa_texttools-0.1.43/texttools/tools/summarizer/llm_summerizer.py +108 -0
- hamtaa_texttools-0.1.43/texttools/tools/translator/__init__.py +1 -0
- hamtaa_texttools-0.1.43/texttools/tools/translator/gemma_translator.py +202 -0
hamtaa_texttools-0.1.43/PKG-INFO
@@ -0,0 +1,60 @@
+Metadata-Version: 2.4
+Name: hamtaa-texttools
+Version: 0.1.43
+Summary: A set of high-level NLP tools
+Author: Tohidi, Montazer, Givechi, Mousavinezhad
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+Requires-Dist: openai==1.97.1
+Requires-Dist: numpy==1.26.4
+
+# Text Tools
+
+<p align="center">
+  <img src="https://img.shields.io/badge/TextTools-Python%20Text%20Processing-black?style=for-the-badge&logo=python&logoColor=white">
+</p>
+
+
+<p align="center">
+  <img src="docs/logo.png" alt="Preview" width="300" height="300">
+</p>
+
+
+### How to Install
+
+Install the package using:
+
+```bash
+pip install -U hamtaa-texttools
+```
+
+
+---
+
+## What This Library Is *Not*
+
+This is **not** a collection of low-level utilities.
+
+To clarify: this library **does not** include things like:
+- Standard `regex` utilities
+- Word normalization utilities
+
+---
+
+## What This Library *Provides*
+
+This is a set of **high-level natural language processing (NLP)** tools.
+
+Some of the features include:
+- `question_detector`: Detects whether an incoming text is a question
+- `categorizer`: A categorizer that requires no fine-tuning
+- ... (Tell me what you want!)
+
+---
+
+## When to Use This Library
+
+Use `texttools` when:
+- You need to **process large volumes of data using OpenAI’s GPT models** via the Batch API.
+- You want to treat an **LLM as a function** in Python that outputs structured JSON or Pydantic models.
+- You need to **categorize large datasets** using vector embeddings, efficiently and at scale.
hamtaa_texttools-0.1.43/README.md
@@ -0,0 +1,50 @@
+# Text Tools
+
+<p align="center">
+  <img src="https://img.shields.io/badge/TextTools-Python%20Text%20Processing-black?style=for-the-badge&logo=python&logoColor=white">
+</p>
+
+
+<p align="center">
+  <img src="docs/logo.png" alt="Preview" width="300" height="300">
+</p>
+
+
+### How to Install
+
+Install the package using:
+
+```bash
+pip install -U hamtaa-texttools
+```
+
+
+---
+
+## What This Library Is *Not*
+
+This is **not** a collection of low-level utilities.
+
+To clarify: this library **does not** include things like:
+- Standard `regex` utilities
+- Word normalization utilities
+
+---
+
+## What This Library *Provides*
+
+This is a set of **high-level natural language processing (NLP)** tools.
+
+Some of the features include:
+- `question_detector`: Detects whether an incoming text is a question
+- `categorizer`: A categorizer that requires no fine-tuning
+- ... (Tell me what you want!)
+
+---
+
+## When to Use This Library
+
+Use `texttools` when:
+- You need to **process large volumes of data using OpenAI’s GPT models** via the Batch API.
+- You want to treat an **LLM as a function** in Python that outputs structured JSON or Pydantic models.
+- You need to **categorize large datasets** using vector embeddings, efficiently and at scale.
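The second bullet above — treating an LLM as a function that returns structured output — is what the bundled `tests/test_vllm_output.py` shown later in this diff demonstrates. As a minimal standalone sketch of the same idea (not part of the package; the model name and the assumption of an OpenAI-compatible endpoint configured via environment variables are placeholders):

```python
# Minimal sketch: call an LLM like a typed Python function, mirroring the
# approach in the package's bundled tests/test_vllm_output.py.
from typing import Literal

from openai import OpenAI
from pydantic import BaseModel


class Sentiment(BaseModel):
    reason: str
    tag: Literal["Positive", "Negative"]


client = OpenAI()  # assumes OPENAI_API_KEY / base_url are configured in the environment


def classify(text: str) -> Sentiment:
    """Ask the model for a structured answer and get a validated Pydantic object back."""
    response = client.beta.chat.completions.parse(
        model="gpt-4o-mini",  # placeholder model name, not specified by the README
        messages=[{"role": "user", "content": f"Classify the sentiment:\n{text}"}],
        response_format=Sentiment,
    )
    return response.choices[0].message.parsed


print(classify("I finally get to go home today!"))
```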
hamtaa_texttools-0.1.43/hamtaa_texttools.egg-info/PKG-INFO
@@ -0,0 +1,60 @@
+Metadata-Version: 2.4
+Name: hamtaa-texttools
+Version: 0.1.43
+Summary: A set of high-level NLP tools
+Author: Tohidi, Montazer, Givechi, Mousavinezhad
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+Requires-Dist: openai==1.97.1
+Requires-Dist: numpy==1.26.4
+
+# Text Tools
+
+<p align="center">
+  <img src="https://img.shields.io/badge/TextTools-Python%20Text%20Processing-black?style=for-the-badge&logo=python&logoColor=white">
+</p>
+
+
+<p align="center">
+  <img src="docs/logo.png" alt="Preview" width="300" height="300">
+</p>
+
+
+### How to Install
+
+Install the package using:
+
+```bash
+pip install -U hamtaa-texttools
+```
+
+
+---
+
+## What This Library Is *Not*
+
+This is **not** a collection of low-level utilities.
+
+To clarify: this library **does not** include things like:
+- Standard `regex` utilities
+- Word normalization utilities
+
+---
+
+## What This Library *Provides*
+
+This is a set of **high-level natural language processing (NLP)** tools.
+
+Some of the features include:
+- `question_detector`: Detects whether an incoming text is a question
+- `categorizer`: A categorizer that requires no fine-tuning
+- ... (Tell me what you want!)
+
+---
+
+## When to Use This Library
+
+Use `texttools` when:
+- You need to **process large volumes of data using OpenAI’s GPT models** via the Batch API.
+- You want to treat an **LLM as a function** in Python that outputs structured JSON or Pydantic models.
+- You need to **categorize large datasets** using vector embeddings, efficiently and at scale.
hamtaa_texttools-0.1.43/hamtaa_texttools.egg-info/SOURCES.txt
@@ -0,0 +1,65 @@
+README.md
+pyproject.toml
+setup.py
+hamtaa_texttools.egg-info/PKG-INFO
+hamtaa_texttools.egg-info/SOURCES.txt
+hamtaa_texttools.egg-info/dependency_links.txt
+hamtaa_texttools.egg-info/requires.txt
+hamtaa_texttools.egg-info/top_level.txt
+tests/test_vllm_output.py
+texttools/__init__.py
+texttools/base/__init__.py
+texttools/base/base_categorizer.py
+texttools/base/base_keyword_extractor.py
+texttools/base/base_ner_extractor.py
+texttools/base/base_question_detector.py
+texttools/base/base_question_generator.py
+texttools/base/base_question_merger.py
+texttools/base/base_question_rewriter.py
+texttools/base/base_router.py
+texttools/base/base_summarizer.py
+texttools/base/base_task_performer.py
+texttools/base/base_translator.py
+texttools/batch_manager/__init__.py
+texttools/batch_manager/batch_manager.py
+texttools/batch_manager/batch_runner.py
+texttools/formatter/__init__.py
+texttools/formatter/base.py
+texttools/formatter/gemma3_formatter.py
+texttools/handlers/__init__.py
+texttools/handlers/handlers.py
+texttools/handlers/categorizer/__init__.py
+texttools/handlers/categorizer/categorizer.py
+texttools/tools/__init__.py
+texttools/tools/categorizer/__init__.py
+texttools/tools/categorizer/encoder_model/__init__.py
+texttools/tools/categorizer/encoder_model/encoder_vectorizer.py
+texttools/tools/categorizer/llm/__init__.py
+texttools/tools/categorizer/llm/gemma_categorizer.py
+texttools/tools/categorizer/llm/openai_categorizer.py
+texttools/tools/keyword_extractor/__init__.py
+texttools/tools/keyword_extractor/gemma_extractor.py
+texttools/tools/merger/__init__.py
+texttools/tools/merger/gemma_question_merger.py
+texttools/tools/ner/__init__.py
+texttools/tools/ner/gemma_ner_extractor.py
+texttools/tools/question_detector/__init__.py
+texttools/tools/question_detector/gemma_detector.py
+texttools/tools/question_detector/llm_detector.py
+texttools/tools/question_generator/__init__.py
+texttools/tools/question_generator/gemma_question_generator.py
+texttools/tools/reranker/__init__.py
+texttools/tools/reranker/reranker.py
+texttools/tools/reranker/scorer.py
+texttools/tools/reranker/sorter.py
+texttools/tools/rewriter/__init__.py
+texttools/tools/rewriter/gemma_question_rewriter.py
+texttools/tools/router/__init__.py
+texttools/tools/router/gemma_router.py
+texttools/tools/subject_to_question/__init__.py
+texttools/tools/subject_to_question/gemma_question_generator.py
+texttools/tools/summarizer/__init__.py
+texttools/tools/summarizer/gemma_summarizer.py
+texttools/tools/summarizer/llm_summerizer.py
+texttools/tools/translator/__init__.py
+texttools/tools/translator/gemma_translator.py
hamtaa_texttools-0.1.43/hamtaa_texttools.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
hamtaa_texttools-0.1.43/hamtaa_texttools.egg-info/top_level.txt
@@ -0,0 +1 @@
+texttools
hamtaa_texttools-0.1.43/pyproject.toml
@@ -0,0 +1,24 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "hamtaa-texttools"
+version = "0.1.43"
+description = "A set of high-level NLP tools"
+readme = "README.md"
+requires-python = ">=3.8"
+authors = [
+    { name = "Tohidi" },
+    { name = "Montazer" },
+    { name = "Givechi" },
+    { name = "Mousavinezhad" }
+]
+dependencies = [
+    "openai==1.97.1",
+    "numpy==1.26.4",
+]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["texttools*"]
hamtaa_texttools-0.1.43/tests/test_vllm_output.py
@@ -0,0 +1,79 @@
+import json
+from typing import Literal
+
+from dotenv import load_dotenv
+from openai import OpenAI
+from pydantic import BaseModel
+
+load_dotenv()
+
+client = OpenAI()
+
+
+class Output(BaseModel):
+    reason: str
+    tag: Literal["Positive", "Negative"]
+
+
+# The Persian prompt below asks the model to read the text, give a very short
+# analysis of its main idea (at most 20 words), and then pick one of the tags.
+# The sample text is "I want to go home today!!".
+messages = [
+    {
+        "role": "user",
+        "content": """
+هدف ما طبقه بندی متن هست
+متن رو بخون و ایده اصلی و آنالیزی کوتاه از اون رو ارائه بده
+
+بسیار خلاصه باشه خروجی تو
+نهایتا 20 کلمه
+
+در نهایت یکی از تگ هارو انتخاب کن
+
+متن:
+
+امروز میخواهم به خونه برگردم!!
+""",
+    }
+]
+
+
+def run_parse():
+    return client.beta.chat.completions.parse(
+        model="gemma-3",
+        messages=messages,
+        response_format=Output,
+        extra_body=dict(guided_decoding_backend="auto"),
+    )
+
+
+def run_json_schema():
+    return client.chat.completions.create(
+        model="gemma-3",
+        messages=messages,
+        response_format={
+            "type": "json_schema",
+            "json_schema": {
+                "name": "output-schema",
+                "schema": Output.model_json_schema(),
+            },
+        },
+        extra_body=dict(guided_decoding_backend="auto"),
+    )
+
+
+def main():
+    # Run parse() and print the parsed Pydantic object
+    parsed_response = run_parse()
+
+    parsed_response = parsed_response.choices[0].message
+    parsed_response = parsed_response.parsed
+
+    print(parsed_response)
+
+    # Run json_schema, then parse and print the JSON dict
+    json_schema_response = run_json_schema()
+    raw_content = json_schema_response.choices[0].message.content
+    content_json = json.loads(raw_content)
+    print(json.dumps(content_json, ensure_ascii=False, indent=2))
+
+
+if __name__ == "__main__":
+    main()
hamtaa_texttools-0.1.43/texttools/__init__.py
@@ -0,0 +1,26 @@
+from texttools.batch_manager import BatchJobRunner, SimpleBatchManager
+from texttools.handlers import (
+    NoOpResultHandler,
+    PrintResultHandler,
+    ResultHandler,
+    SaveToFileResultHandler,
+)
+from texttools.tools.categorizer.encoder_model.encoder_vectorizer import (
+    EmbeddingCategorizer,
+)
+from texttools.tools.categorizer.llm.openai_categorizer import LLMCategorizer
+from texttools.tools.question_detector.llm_detector import LLMQuestionDetector
+from texttools.tools.summarizer import LLMSummarizer
+
+__all__ = [
+    "LLMQuestionDetector",
+    "NoOpResultHandler",
+    "PrintResultHandler",
+    "ResultHandler",
+    "SaveToFileResultHandler",
+    "EmbeddingCategorizer",
+    "LLMCategorizer",
+    "SimpleBatchManager",
+    "BatchJobRunner",
+    "LLMSummarizer",
+]
hamtaa_texttools-0.1.43/texttools/base/base_categorizer.py
@@ -0,0 +1,40 @@
+import logging
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Optional
+
+from texttools.handlers import NoOpResultHandler, ResultHandler
+
+
+class BaseCategorizer(ABC):
+    def __init__(
+        self,
+        handlers: Optional[list[ResultHandler]] = None,
+    ):
+        """
+        handlers: List of ResultHandler objects that will process results after categorization.
+        """
+        self.handlers = handlers or [NoOpResultHandler()]
+
+    @abstractmethod
+    def categorize(self, text: str) -> Enum:
+        """
+        Categorize the input text.
+        Must return one of the Enum members defined in self.categories.
+        """
+        pass
+
+    def preprocess(self, text: str) -> str:
+        """
+        Optional: Preprocess text before categorization.
+        """
+        return text
+
+    def _dispatch(self, results: dict) -> None:
+        for handler in self.handlers:
+            try:
+                handler.handle(results)
+            except Exception:
+                logging.error(
+                    f"Handler {handler.__class__.__name__} failed", exc_info=True
+                )
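The base classes in this package share one small pattern: an abstract core method plus `_dispatch`, which forwards a result dict to the registered handlers. A minimal sketch of a concrete categorizer against the `BaseCategorizer` interface above — the `Topic` enum and the keyword rule are invented purely for illustration; the package's real categorizers delegate to an LLM or to embeddings instead:

```python
# Not part of the package: an illustrative subclass of BaseCategorizer.
from enum import Enum

from texttools.base.base_categorizer import BaseCategorizer


class Topic(Enum):  # hypothetical category set
    SPORTS = "sports"
    OTHER = "other"


class KeywordCategorizer(BaseCategorizer):
    def categorize(self, text: str) -> Enum:
        text = self.preprocess(text)
        label = Topic.SPORTS if "football" in text.lower() else Topic.OTHER
        # Forward the result to any registered ResultHandler objects
        self._dispatch({"text": text, "category": label.name})
        return label


print(KeywordCategorizer().categorize("The football match ended 2-1."))
```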
hamtaa_texttools-0.1.43/texttools/base/base_keyword_extractor.py
@@ -0,0 +1,35 @@
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+
+
+class BaseKeywordExtractor(ABC):
+    """
+    Base class for all extractors that output a list of keywords.
+    """
+
+    def __init__(
+        self,
+        handlers: Optional[list[Any]] = None,
+    ):
+        self.handlers = handlers or []
+
+    @abstractmethod
+    def extract_keywords(self, text: str) -> list[str]:
+        """
+        Extract keywords from the input text.
+        Should return a list of strings, where each string is a keyword.
+        """
+        pass
+
+    def preprocess(self, text: str) -> str:
+        """
+        Optional text preprocessing step.
+        """
+        return text.strip()
+
+    def _dispatch(self, result: dict) -> None:
+        """
+        Dispatch the result to handlers.
+        """
+        for handler in self.handlers:
+            handler.handle(result)
hamtaa_texttools-0.1.43/texttools/base/base_ner_extractor.py
@@ -0,0 +1,61 @@
+import logging
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+
+
+class BaseNERExtractor(ABC):
+    """
+    Base class for all Named Entity Recognition (NER) systems.
+    """
+
+    def __init__(self, handlers: Optional[list[Any]] = None):
+        """
+        Initializes the BaseNERExtractor with optional result handlers.
+
+        :param handlers: Optional list of handlers to process the NER results.
+        """
+        self.handlers = handlers or []
+
+    @abstractmethod
+    def extract_entities(self, text: str) -> list[dict[str, str]]:
+        """
+        Extracts named entities from the input text.
+
+        :param text: The text from which to extract entities.
+        :return: A list of dictionaries, where each dictionary represents an entity
+                 and typically includes 'text' and 'type' keys (e.g.,
+                 [{"text": "John Doe", "type": "PERSON"}, ...]).
+        """
+        pass
+
+    def preprocess(self, text: str) -> str:
+        """
+        Optional: Preprocess the input text before entity extraction.
+
+        :param text: Raw input text.
+        :return: Preprocessed text.
+        """
+        return text.strip()
+
+    def _dispatch(
+        self, entities: list[dict[str, str]], original_text: Optional[str] = None
+    ) -> None:
+        """
+        Sends the extracted entities to any registered result handlers.
+
+        :param entities: The list of extracted entities.
+        :param original_text: Optionally pass the original text.
+        """
+        result_data = {
+            "entities": entities,
+        }
+        if original_text is not None:
+            result_data["original_text"] = original_text
+
+        for handler in self.handlers:
+            try:
+                handler.handle(result_data)
+            except Exception:
+                logging.error(
+                    f"Handler {handler.__class__.__name__} failed", exc_info=True
+                )
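On the receiving side of `_dispatch`, the only contract visible in this diff is that a handler exposes `handle(result: dict)`; whether custom handlers are expected to subclass `texttools.handlers.ResultHandler` is not shown here. A rough, self-contained sketch of such a handler, under that assumption:

```python
# Not part of the package: a hypothetical handler that persists dispatched results.
import json


class JsonLinesHandler:
    """Append every dispatched result dict to a JSONL file."""

    def __init__(self, path: str = "results.jsonl"):
        self.path = path

    def handle(self, result: dict) -> None:
        # One JSON object per line, keeping non-ASCII text readable
        with open(self.path, "a", encoding="utf-8") as f:
            f.write(json.dumps(result, ensure_ascii=False) + "\n")
```

Any extractor or categorizer built on the base classes could then be constructed with `handlers=[JsonLinesHandler()]` so each result is persisted as it is produced.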
hamtaa_texttools-0.1.43/texttools/base/base_question_detector.py
@@ -0,0 +1,35 @@
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+
+
+class BaseQuestionDetector(ABC):
+    """
+    Base class for all detectors that output a boolean (True/False).
+    """
+
+    def __init__(
+        self,
+        handlers: Optional[list[Any]] = None,
+    ):
+        self.handlers = handlers or []
+
+    @abstractmethod
+    def detect(self, text: str) -> bool:
+        """
+        Detect if the input text meets the condition.
+        Should return True or False.
+        """
+        pass
+
+    def preprocess(self, text: str) -> str:
+        """
+        Optional text preprocessing step.
+        """
+        return text.strip()
+
+    def _dispatch(self, result: dict) -> None:
+        """
+        Dispatch the result to handlers.
+        """
+        for handler in self.handlers:
+            handler.handle(result)
hamtaa_texttools-0.1.43/texttools/base/base_question_generator.py
@@ -0,0 +1,99 @@
+import logging
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+
+
+class BaseQuestionGenerator(ABC):
+    """
+    Base class for all systems that generate a question from a given answer.
+    """
+
+    def __init__(self, handlers: Optional[list[Any]] = None):
+        """
+        Initializes the BaseQuestionGenerator with optional result handlers.
+
+        :param handlers: Optional list of handlers to process the generation results.
+        """
+        self.handlers = handlers or []
+
+    @abstractmethod
+    def generate_question(self, answer: str) -> str:
+        """
+        Generates an appropriate question for the provided answer.
+
+        :param answer: The answer string for which a question needs to be generated.
+        :return: The generated question string.
+        """
+        pass
+
+    def preprocess(self, text: str) -> str:
+        """
+        Optional: Preprocess the input answer text before question generation.
+
+        :param text: Raw input answer text.
+        :return: Preprocessed text.
+        """
+        return text.strip()
+
+    def _dispatch(self, result_data: dict) -> None:
+        """
+        Sends the generated question and original answer to any registered result handlers.
+
+        :param result_data: A dictionary containing the results (e.g., {"original_answer": ..., "generated_question": ...}).
+        """
+        for handler in self.handlers:
+            try:
+                handler.handle(result_data)
+            except Exception:
+                logging.error(
+                    f"Handler {handler.__class__.__name__} failed", exc_info=True
+                )
+
+
+class BaseQuestionGeneratorFromSubject(ABC):
+    """
+    Base class for all systems that generate questions from a given subject;
+    it curates a number of questions for that subject.
+    """
+
+    def __init__(self, handlers: Optional[list[Any]] = None):
+        """
+        Initializes the BaseQuestionGeneratorFromSubject with optional result handlers.
+
+        :param handlers: Optional list of handlers to process the generation results.
+        """
+        self.handlers = handlers or []
+
+    @abstractmethod
+    def generate_question(self, subject: str) -> str:
+        """
+        Generates appropriate questions for the provided subject.
+
+        :param subject: The subject string for which questions need to be generated.
+        :return: The generated question string.
+        """
+        pass
+
+    def preprocess(self, text: str) -> str:
+        """
+        Optional: Preprocess the input subject text before question generation.
+
+        :param text: Raw input subject text.
+        :return: Preprocessed text.
+        """
+        return text.strip()
+
+    def _dispatch(self, result_data: dict) -> None:
+        """
+        Sends the generated questions and original subject to any registered result handlers.
+
+        :param result_data: A dictionary containing the results.
+        """
+        for handler in self.handlers:
+            try:
+                handler.handle(result_data)
+            except Exception:
+                logging.error(
+                    f"Handler {handler.__class__.__name__} failed", exc_info=True
+                )