text-summarizer-gi 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- text_summarizer_gi-0.1.0/PKG-INFO +110 -0
- text_summarizer_gi-0.1.0/README.md +100 -0
- text_summarizer_gi-0.1.0/llm_summarizer/__init__.py +4 -0
- text_summarizer_gi-0.1.0/llm_summarizer/chunking.py +11 -0
- text_summarizer_gi-0.1.0/llm_summarizer/embeddings.py +34 -0
- text_summarizer_gi-0.1.0/llm_summarizer/evaluator.py +0 -0
- text_summarizer_gi-0.1.0/llm_summarizer/prompts.py +37 -0
- text_summarizer_gi-0.1.0/llm_summarizer/summarizer.py +196 -0
- text_summarizer_gi-0.1.0/llm_summarizer/token_counter.py +64 -0
- text_summarizer_gi-0.1.0/llm_summarizer/utils.py +47 -0
- text_summarizer_gi-0.1.0/pyproject.toml +17 -0
- text_summarizer_gi-0.1.0/setup.cfg +4 -0
- text_summarizer_gi-0.1.0/text_summarizer_gi.egg-info/PKG-INFO +110 -0
- text_summarizer_gi-0.1.0/text_summarizer_gi.egg-info/SOURCES.txt +15 -0
- text_summarizer_gi-0.1.0/text_summarizer_gi.egg-info/dependency_links.txt +1 -0
- text_summarizer_gi-0.1.0/text_summarizer_gi.egg-info/requires.txt +2 -0
- text_summarizer_gi-0.1.0/text_summarizer_gi.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: text-summarizer-gi
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LLM based context aware text summarizer
|
|
5
|
+
Author-email: Dhivya J <dhivyashankar27@example.com>
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: openai
|
|
9
|
+
Requires-Dist: tiktoken
|
|
10
|
+
|
|
11
|
+
# azure-llm-summarizer
|
|
12
|
+
|
|
13
|
+
A lightweight Python library for summarizing text using **Azure OpenAI**, with built-in **token counting** for both input and output — no extra dependencies beyond `openai`.
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install text-summarizer-gi
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Quick Start
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
from llm_summarizer import AzureSummarizer
|
|
25
|
+
|
|
26
|
+
summarizer = AzureSummarizer(
|
|
27
|
+
api_key="<your-azure-api-key>",
|
|
28
|
+
azure_endpoint="https://<your-resource>.openai.azure.com",
|
|
29
|
+
deployment_name="gpt-4o-mini", # your deployed model name
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
result = summarizer.summarize(
|
|
33
|
+
text="Your long document or passage goes here...",
|
|
34
|
+
summary_type="medium", # "short" | "medium" | "detailed"
|
|
35
|
+
tone="neutral", # "neutral" | "formal" | "casual"
|
|
36
|
+
focus_area="general", # "general" | "technical insights" | "financial" ...
|
|
37
|
+
output_format="text", # "text" | "bullets" | "json"
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
print(result.summary) # the summary
|
|
41
|
+
print(result.input_tokens) # token count of the original passage
|
|
42
|
+
print(result.output_tokens) # token count of the summary
|
|
43
|
+
print(result) # summary + token counts in one print
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Example Output
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
Generative AI (GenAI) refers to AI systems that create new content by learning
|
|
50
|
+
from large datasets, using architectures like LLMs and diffusion models...
|
|
51
|
+
|
|
52
|
+
[Tokens — Input: 312 | Summary: 47]
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## API Reference
|
|
56
|
+
|
|
57
|
+
### `AzureSummarizer(azure_endpoint, api_key, api_version, deployment_name)`
|
|
58
|
+
|
|
59
|
+
| Parameter | Type | Default | Description |
|
|
60
|
+
|-----------|------|---------|-------------|
|
|
61
|
+
| `azure_endpoint` | `str` | env `AZURE_OPENAI_ENDPOINT` | Azure resource endpoint |
|
|
62
|
+
| `api_key` | `str` | env `AZURE_OPENAI_API_KEY` | Azure API key |
|
|
63
|
+
| `api_version` | `str` | `"2024-02-15-preview"` | API version |
|
|
64
|
+
| `deployment_name` | `str` | `"gi-local-gpt-5-mini"` | Deployed model name |
|
|
65
|
+
|
|
66
|
+
### `.summarize(text, summary_type, tone, focus_area, output_format) → SummaryResult`
|
|
67
|
+
|
|
68
|
+
| Parameter | Options | Default |
|
|
69
|
+
|-----------|---------|---------|
|
|
70
|
+
| `summary_type` | `"short"`, `"medium"`, `"detailed"` | `"medium"` |
|
|
71
|
+
| `tone` | `"neutral"`, `"formal"`, `"casual"` | `"neutral"` |
|
|
72
|
+
| `focus_area` | any string | `"general"` |
|
|
73
|
+
| `output_format` | `"text"`, `"bullets"`, `"json"` | `"text"` |
|
|
74
|
+
|
|
75
|
+
### `SummaryResult`
|
|
76
|
+
|
|
77
|
+
| Attribute | Type | Description |
|
|
78
|
+
|-----------|------|-------------|
|
|
79
|
+
| `summary` | `str` | The generated summary |
|
|
80
|
+
| `input_tokens` | `int` | Estimated tokens in the original text |
|
|
81
|
+
| `output_tokens` | `int` | Estimated tokens in the summary |
|
|
82
|
+
|
|
83
|
+
### Standalone token counter
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from llm_summarizer import count_tokens
|
|
87
|
+
|
|
88
|
+
count_tokens("Hello, world!")  # → 6
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Environment Variables
|
|
92
|
+
|
|
93
|
+
You can skip passing credentials directly and use env vars instead:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
export AZURE_OPENAI_ENDPOINT="https://<resource>.openai.azure.com"
|
|
97
|
+
export AZURE_OPENAI_API_KEY="<your-key>"
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Publishing to PyPI
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
pip install build twine
|
|
104
|
+
python -m build
|
|
105
|
+
twine upload dist/*
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## License
|
|
109
|
+
|
|
110
|
+
MIT
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# azure-llm-summarizer
|
|
2
|
+
|
|
3
|
+
A lightweight Python library for summarizing text using **Azure OpenAI**, with built-in **token counting** for both input and output — no extra dependencies beyond `openai`.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install text-summarizer-gi
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from llm_summarizer import AzureSummarizer
|
|
15
|
+
|
|
16
|
+
summarizer = AzureSummarizer(
|
|
17
|
+
api_key="<your-azure-api-key>",
|
|
18
|
+
azure_endpoint="https://<your-resource>.openai.azure.com",
|
|
19
|
+
deployment_name="gpt-4o-mini", # your deployed model name
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
result = summarizer.summarize(
|
|
23
|
+
text="Your long document or passage goes here...",
|
|
24
|
+
summary_type="medium", # "short" | "medium" | "detailed"
|
|
25
|
+
tone="neutral", # "neutral" | "formal" | "casual"
|
|
26
|
+
focus_area="general", # "general" | "technical insights" | "financial" ...
|
|
27
|
+
output_format="text", # "text" | "bullets" | "json"
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
print(result.summary) # the summary
|
|
31
|
+
print(result.input_tokens) # token count of the original passage
|
|
32
|
+
print(result.output_tokens) # token count of the summary
|
|
33
|
+
print(result) # summary + token counts in one print
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
### Example Output
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
Generative AI (GenAI) refers to AI systems that create new content by learning
|
|
40
|
+
from large datasets, using architectures like LLMs and diffusion models...
|
|
41
|
+
|
|
42
|
+
[Tokens — Input: 312 | Summary: 47]
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## API Reference
|
|
46
|
+
|
|
47
|
+
### `AzureSummarizer(azure_endpoint, api_key, api_version, deployment_name)`
|
|
48
|
+
|
|
49
|
+
| Parameter | Type | Default | Description |
|
|
50
|
+
|-----------|------|---------|-------------|
|
|
51
|
+
| `azure_endpoint` | `str` | env `AZURE_OPENAI_ENDPOINT` | Azure resource endpoint |
|
|
52
|
+
| `api_key` | `str` | env `AZURE_OPENAI_API_KEY` | Azure API key |
|
|
53
|
+
| `api_version` | `str` | `"2024-02-15-preview"` | API version |
|
|
54
|
+
| `deployment_name` | `str` | `"gi-local-gpt-5-mini"` | Deployed model name |
|
|
55
|
+
|
|
56
|
+
### `.summarize(text, summary_type, tone, focus_area, output_format) → SummaryResult`
|
|
57
|
+
|
|
58
|
+
| Parameter | Options | Default |
|
|
59
|
+
|-----------|---------|---------|
|
|
60
|
+
| `summary_type` | `"short"`, `"medium"`, `"detailed"` | `"medium"` |
|
|
61
|
+
| `tone` | `"neutral"`, `"formal"`, `"casual"` | `"neutral"` |
|
|
62
|
+
| `focus_area` | any string | `"general"` |
|
|
63
|
+
| `output_format` | `"text"`, `"bullets"`, `"json"` | `"text"` |
|
|
64
|
+
|
|
65
|
+
### `SummaryResult`
|
|
66
|
+
|
|
67
|
+
| Attribute | Type | Description |
|
|
68
|
+
|-----------|------|-------------|
|
|
69
|
+
| `summary` | `str` | The generated summary |
|
|
70
|
+
| `input_tokens` | `int` | Estimated tokens in the original text |
|
|
71
|
+
| `output_tokens` | `int` | Estimated tokens in the summary |
|
|
72
|
+
|
|
73
|
+
### Standalone token counter
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from llm_summarizer import count_tokens
|
|
77
|
+
|
|
78
|
+
count_tokens("Hello, world!")  # → 6
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Environment Variables
|
|
82
|
+
|
|
83
|
+
You can skip passing credentials directly and use env vars instead:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
export AZURE_OPENAI_ENDPOINT="https://<resource>.openai.azure.com"
|
|
87
|
+
export AZURE_OPENAI_API_KEY="<your-key>"
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Publishing to PyPI
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
pip install build twine
|
|
94
|
+
python -m build
|
|
95
|
+
twine upload dist/*
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## License
|
|
99
|
+
|
|
100
|
+
MIT
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# llm_summarizer/embeddings.py
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from openai import AzureOpenAI
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class AzureEmbeddingClient:
    """Thin client for the Azure OpenAI embeddings endpoint.

    Credentials fall back to the ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_API_KEY`` environment variables when not supplied
    explicitly.

    Raises:
        ValueError: If neither arguments nor environment variables
            provide an endpoint and API key.
    """

    def __init__(
        self,
        azure_endpoint=None,
        api_key=None,
        api_version="2024-02-15-preview",
        deployment_name="text-embedding-3-large",
    ):
        # Explicit arguments win; environment variables are the fallback.
        self.azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
        self.api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY")
        self.deployment_name = deployment_name

        if not (self.azure_endpoint and self.api_key):
            raise ValueError("Azure endpoint and API key must be provided")

        self.client = AzureOpenAI(
            api_key=self.api_key,
            api_version=api_version,
            azure_endpoint=self.azure_endpoint,
        )

    def embed(self, texts):
        """Return one embedding vector (list of floats) per input text."""
        api_response = self.client.embeddings.create(
            model=self.deployment_name,
            input=texts,
        )
        vectors = []
        for item in api_response.data:
            vectors.append(item.embedding)
        return vectors
|
|
File without changes
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# llm_summarizer/prompts.py
|
|
2
|
+
|
|
3
|
+
SYSTEM_PROMPT = """
|
|
4
|
+
You are a professional AI summarization engine.
|
|
5
|
+
|
|
6
|
+
STRICT RULES:
|
|
7
|
+
1. Preserve factual accuracy.
|
|
8
|
+
2. Do NOT hallucinate or invent information.
|
|
9
|
+
3. Do NOT add external knowledge.
|
|
10
|
+
4. Preserve key entities (names, dates, numbers, metrics).
|
|
11
|
+
5. Maintain logical flow.
|
|
12
|
+
6. If information is unclear, summarize conservatively.
|
|
13
|
+
7. Keep technical terminology intact.
|
|
14
|
+
8. Do not include opinions unless explicitly present in the text.
|
|
15
|
+
9. CRITICAL: Your summary MUST be significantly shorter than the input. Never output more words than the original text.
|
|
16
|
+
10. CRITICAL: Obey the Length Requirement strictly — do not exceed it under any circumstances.
|
|
17
|
+
|
|
18
|
+
Your goal is to produce a concise but information-dense summary.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def build_user_prompt(text, summary_length, tone, focus_area, output_format):
    """Assemble the user-role prompt for one summarization call.

    The prompt embeds the length/tone/focus constraints, the input's
    word count (stated as a concrete upper bound for the model), the
    output-format instruction, and finally the text itself.
    """
    n_words = len(text.split())
    prompt = f"""
Summarize the following text.

Length Requirement: {summary_length}
Tone: {tone}
Focus Area: {focus_area}
Input word count: {n_words} words — your summary must be well under this count.

Output Format:
{output_format}

Text:
{text}
"""
    return prompt
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from openai import AzureOpenAI
|
|
4
|
+
|
|
5
|
+
from .chunking import chunk_text
|
|
6
|
+
from .prompts import SYSTEM_PROMPT, build_user_prompt
|
|
7
|
+
from .utils import validate_input, length_mapper, max_tokens_mapper, format_mapper, logger
|
|
8
|
+
from .token_counter import count_tokens
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class SummaryResult:
    """Container returned by :meth:`AzureSummarizer.summarize`.

    Attributes:
        summary (str):
            The generated summary text.
        input_tokens (int):
            Estimated token count of the original input text.
        output_tokens (int):
            Estimated token count of the generated summary.

    Example::

        result = summarizer.summarize("Long article...")
        print(result.summary)
        print(f"Input : {result.input_tokens} tokens")
        print(f"Output: {result.output_tokens} tokens")
    """

    summary: str
    input_tokens: int
    output_tokens: int

    def __str__(self) -> str:
        # Summary first, then a compact token footer on its own line.
        footer = f"[Tokens — Input: {self.input_tokens} | Summary: {self.output_tokens}]"
        return f"{self.summary}\n\n{footer}"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class AzureSummarizer:
    """
    Azure OpenAI-backed text summarizer.

    Args:
        azure_endpoint (str, optional):
            Azure OpenAI endpoint URL. Falls back to the
            ``AZURE_OPENAI_ENDPOINT`` environment variable.
        api_key (str, optional):
            Azure OpenAI API key. Falls back to the
            ``AZURE_OPENAI_API_KEY`` environment variable.
        api_version (str):
            Azure OpenAI API version string.
        deployment_name (str):
            Name of the deployed model to use.

    Raises:
        ValueError: If no endpoint or API key can be resolved.

    Example::

        from llm_summarizer import AzureSummarizer

        summarizer = AzureSummarizer(
            api_key="<key>",
            azure_endpoint="https://<resource>.openai.azure.com",
        )
        result = summarizer.summarize(
            text="Your long document here...",
            summary_type="medium",   # "short" | "medium" | "detailed"
            tone="neutral",
            focus_area="general",
            output_format="text",    # "text" | "bullets" | "json"
        )
        print(result)                # summary + token counts
        print(result.summary)        # summary text only
        print(result.input_tokens)   # token count of original text
        print(result.output_tokens)  # token count of summary
    """

    def __init__(
        self,
        azure_endpoint: str = None,
        api_key: str = None,
        api_version: str = "2024-02-15-preview",
        deployment_name: str = "gi-local-gpt-5-mini",
    ):
        # Explicit arguments win; environment variables are the fallback.
        self.azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
        self.api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY")
        self.deployment_name = deployment_name

        if not self.azure_endpoint or not self.api_key:
            raise ValueError("Azure endpoint and API key must be provided")

        self.client = AzureOpenAI(
            api_key=self.api_key,
            api_version=api_version,
            azure_endpoint=self.azure_endpoint,
        )

    def summarize(
        self,
        text: str,
        summary_type: str = "medium",
        tone: str = "neutral",
        focus_area: str = "general",
        output_format: str = "text",
    ) -> SummaryResult:
        """
        Summarize *text* and return a :class:`SummaryResult` containing
        the summary and token counts for both the input and the output.

        Args:
            text (str):
                The passage to summarise.
            summary_type (str):
                Desired length -- ``"short"``, ``"medium"``, or
                ``"detailed"``. Defaults to ``"medium"``.
            tone (str):
                Writing tone, e.g. ``"neutral"``, ``"formal"``,
                ``"casual"``. Defaults to ``"neutral"``.
            focus_area (str):
                Topic lens, e.g. ``"general"``, ``"technical insights"``,
                ``"financial"``. Defaults to ``"general"``.
            output_format (str):
                Output structure -- ``"text"``, ``"bullets"``, or
                ``"json"``. Defaults to ``"text"``.

        Returns:
            :class:`SummaryResult`:
                Object with ``summary``, ``input_tokens``, and
                ``output_tokens`` attributes.

        Raises:
            ValueError: If *text* is empty or whitespace-only
                (propagated from :func:`validate_input`).
        """
        should_continue = validate_input(text)
        input_tokens = count_tokens(text)

        if not should_continue:
            # Text is too short to summarize: echo it back unchanged.
            # Input and output are the same string, so reuse the token
            # count instead of estimating it a second time.
            return SummaryResult(
                summary=text,
                input_tokens=input_tokens,
                output_tokens=input_tokens,
            )

        summary_length = length_mapper(summary_type)
        max_tokens = max_tokens_mapper(summary_type)
        format_instruction = format_mapper(output_format)
        chunks = chunk_text(text)

        logger.info(f"Total chunks: {len(chunks)}")

        if len(chunks) == 1:
            summary_text = self._summarize_chunk(
                chunks[0], summary_length, tone, focus_area, format_instruction, max_tokens
            )
        else:
            # Map-reduce: summarize each chunk independently, then
            # summarize the concatenation of the partial summaries.
            partial_summaries = [
                self._summarize_chunk(
                    chunk, summary_length, tone, focus_area, format_instruction, max_tokens
                )
                for chunk in chunks
            ]
            combined = " ".join(partial_summaries)
            logger.info("Running reduce summarization")
            summary_text = self._summarize_chunk(
                combined, summary_length, tone, focus_area, format_instruction, max_tokens
            )

        output_tokens = count_tokens(summary_text)
        logger.info(f"Token counts -- input: {input_tokens}, summary: {output_tokens}")

        return SummaryResult(
            summary=summary_text,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
        )

    def _summarize_chunk(
        self,
        text: str,
        summary_length: str,
        tone: str,
        focus_area: str,
        format_instruction: str,
        max_tokens: int = 300,
    ) -> str:
        """Run one chat-completion call over a single chunk and return
        the stripped summary text."""
        user_prompt = build_user_prompt(
            text, summary_length, tone, focus_area, format_instruction
        )
        response = self.client.chat.completions.create(
            model=self.deployment_name,
            max_completion_tokens=max_tokens,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
        )
        return response.choices[0].message.content.strip()
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""
|
|
2
|
+
token_counter.py
|
|
3
|
+
|
|
4
|
+
Lightweight token estimation without external dependencies.
|
|
5
|
+
|
|
6
|
+
Uses a character/word-based heuristic that closely approximates
|
|
7
|
+
GPT-family tokenizers:
|
|
8
|
+
- ~4 characters per token on average for English prose
|
|
9
|
+
- Punctuation and whitespace are counted separately
|
|
10
|
+
- Numbers and special characters are weighted accordingly
|
|
11
|
+
|
|
12
|
+
This gives results within ~5% of tiktoken for typical English text.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _estimate_tokens(text: str) -> int:
    """
    Estimate token count using a rule-based heuristic that mirrors
    BPE tokenizer behaviour without requiring external libraries.

    Rules (derived from OpenAI tokenizer patterns):
    1. Split text into digit runs, word runs, and single punctuation marks.
    2. Each word ≈ ceil(len(word) / 4) tokens.
    3. Punctuation characters each count as 1 token.
    4. Numbers: roughly 1 token per 3 digits.
    """
    if not text:
        return 0

    # \d+ before \w+ so digit runs are classified as numbers, not words;
    # [^\w\s] captures each punctuation character individually.
    tokens = re.findall(r"\d+|\w+|[^\w\s]", text)

    count = 0
    for tok in tokens:
        if re.fullmatch(r"\d+", tok):
            # Numbers: roughly 1 token per 3 digits (ceiling division)
            count += max(1, (len(tok) + 2) // 3)
        elif re.fullmatch(r"[^\w\s]", tok):
            # Punctuation: 1 token each
            count += 1
        else:
            # Words: ~4 chars per token (ceiling division)
            count += max(1, (len(tok) + 3) // 4)

    return count


def count_tokens(text: str) -> int:
    """
    Return the estimated token count for a given string.

    Args:
        text: Input text to count tokens for.

    Returns:
        Estimated number of tokens (integer).

    Example:
        >>> count_tokens("Hello, world!")
        6
    """
    # NOTE: the previous doctest claimed 4, but the heuristic yields
    # Hello(2) + ","(1) + world(2) + "!"(1) = 6.
    return _estimate_tokens(text)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
# Library-local logger.  Per the Python Logging HOWTO, a library must
# not call ``logging.basicConfig`` at import time — that mutates the
# ROOT logger of whatever application imports this package.  Attaching
# a NullHandler instead suppresses "No handlers could be found"
# warnings while leaving logging configuration entirely to the host
# application (which can enable these messages with its own config).
logger = logging.getLogger("llm_summarizer")
logger.addHandler(logging.NullHandler())
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def validate_input(text: str):
    """Decide whether *text* should go through summarization.

    Raises:
        ValueError: If the text is empty or whitespace-only.

    Returns:
        bool: ``False`` when the input is shorter than 50 characters
        (caller should return it unchanged), ``True`` otherwise.
    """
    stripped = text.strip() if text else ""
    if not stripped:
        raise ValueError("Input text cannot be empty.")

    if len(text) >= 50:
        return True

    logger.info("Text too short to summarize. Returning original.")
    return False
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def length_mapper(length_type: str):
    """Translate a summary_type keyword into a length instruction.

    Unknown keywords fall back to the "medium" instruction.
    """
    if length_type == "short":
        return "3-4 sentences. Be extremely concise."
    if length_type == "detailed":
        return "Multiple paragraphs with key insights, but no longer than necessary."
    # "medium" and any unrecognized keyword
    return "1 concise paragraph (5-7 sentences max)."
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def max_tokens_mapper(length_type: str) -> int:
    """Return a hard token cap for the API call based on summary type.

    Unknown keywords fall back to the "medium" cap of 300.
    """
    caps = {"short": 120, "detailed": 700}
    return caps.get(length_type, 300)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def format_mapper(output_format: str):
    """Map an output_format keyword to a model instruction string.

    Unknown formats fall back to plain paragraph text.
    """
    if output_format == "bullets":
        return "Return bullet points."
    if output_format == "json":
        return """Return JSON format:
{
"summary": "",
"key_points": []
}"""
    # "text" and any unrecognized keyword
    return "Return plain paragraph text."
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "text-summarizer-gi"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "LLM based context aware text summarizer"
|
|
9
|
+
authors = [
|
|
10
|
+
{ name = "Dhivya J", email = "dhivyashankar27@example.com" }
|
|
11
|
+
]
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.8"
|
|
14
|
+
dependencies = [
|
|
15
|
+
"openai",
|
|
16
|
+
"tiktoken"
|
|
17
|
+
]
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: text-summarizer-gi
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LLM based context aware text summarizer
|
|
5
|
+
Author-email: Dhivya J <dhivyashankar27@example.com>
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: openai
|
|
9
|
+
Requires-Dist: tiktoken
|
|
10
|
+
|
|
11
|
+
# azure-llm-summarizer
|
|
12
|
+
|
|
13
|
+
A lightweight Python library for summarizing text using **Azure OpenAI**, with built-in **token counting** for both input and output — no extra dependencies beyond `openai`.
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install text-summarizer-gi
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Quick Start
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
from llm_summarizer import AzureSummarizer
|
|
25
|
+
|
|
26
|
+
summarizer = AzureSummarizer(
|
|
27
|
+
api_key="<your-azure-api-key>",
|
|
28
|
+
azure_endpoint="https://<your-resource>.openai.azure.com",
|
|
29
|
+
deployment_name="gpt-4o-mini", # your deployed model name
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
result = summarizer.summarize(
|
|
33
|
+
text="Your long document or passage goes here...",
|
|
34
|
+
summary_type="medium", # "short" | "medium" | "detailed"
|
|
35
|
+
tone="neutral", # "neutral" | "formal" | "casual"
|
|
36
|
+
focus_area="general", # "general" | "technical insights" | "financial" ...
|
|
37
|
+
output_format="text", # "text" | "bullets" | "json"
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
print(result.summary) # the summary
|
|
41
|
+
print(result.input_tokens) # token count of the original passage
|
|
42
|
+
print(result.output_tokens) # token count of the summary
|
|
43
|
+
print(result) # summary + token counts in one print
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Example Output
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
Generative AI (GenAI) refers to AI systems that create new content by learning
|
|
50
|
+
from large datasets, using architectures like LLMs and diffusion models...
|
|
51
|
+
|
|
52
|
+
[Tokens — Input: 312 | Summary: 47]
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## API Reference
|
|
56
|
+
|
|
57
|
+
### `AzureSummarizer(azure_endpoint, api_key, api_version, deployment_name)`
|
|
58
|
+
|
|
59
|
+
| Parameter | Type | Default | Description |
|
|
60
|
+
|-----------|------|---------|-------------|
|
|
61
|
+
| `azure_endpoint` | `str` | env `AZURE_OPENAI_ENDPOINT` | Azure resource endpoint |
|
|
62
|
+
| `api_key` | `str` | env `AZURE_OPENAI_API_KEY` | Azure API key |
|
|
63
|
+
| `api_version` | `str` | `"2024-02-15-preview"` | API version |
|
|
64
|
+
| `deployment_name` | `str` | `"gi-local-gpt-5-mini"` | Deployed model name |
|
|
65
|
+
|
|
66
|
+
### `.summarize(text, summary_type, tone, focus_area, output_format) → SummaryResult`
|
|
67
|
+
|
|
68
|
+
| Parameter | Options | Default |
|
|
69
|
+
|-----------|---------|---------|
|
|
70
|
+
| `summary_type` | `"short"`, `"medium"`, `"detailed"` | `"medium"` |
|
|
71
|
+
| `tone` | `"neutral"`, `"formal"`, `"casual"` | `"neutral"` |
|
|
72
|
+
| `focus_area` | any string | `"general"` |
|
|
73
|
+
| `output_format` | `"text"`, `"bullets"`, `"json"` | `"text"` |
|
|
74
|
+
|
|
75
|
+
### `SummaryResult`
|
|
76
|
+
|
|
77
|
+
| Attribute | Type | Description |
|
|
78
|
+
|-----------|------|-------------|
|
|
79
|
+
| `summary` | `str` | The generated summary |
|
|
80
|
+
| `input_tokens` | `int` | Estimated tokens in the original text |
|
|
81
|
+
| `output_tokens` | `int` | Estimated tokens in the summary |
|
|
82
|
+
|
|
83
|
+
### Standalone token counter
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from llm_summarizer import count_tokens
|
|
87
|
+
|
|
88
|
+
count_tokens("Hello, world!")  # → 6
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Environment Variables
|
|
92
|
+
|
|
93
|
+
You can skip passing credentials directly and use env vars instead:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
export AZURE_OPENAI_ENDPOINT="https://<resource>.openai.azure.com"
|
|
97
|
+
export AZURE_OPENAI_API_KEY="<your-key>"
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Publishing to PyPI
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
pip install build twine
|
|
104
|
+
python -m build
|
|
105
|
+
twine upload dist/*
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## License
|
|
109
|
+
|
|
110
|
+
MIT
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
llm_summarizer/__init__.py
|
|
4
|
+
llm_summarizer/chunking.py
|
|
5
|
+
llm_summarizer/embeddings.py
|
|
6
|
+
llm_summarizer/evaluator.py
|
|
7
|
+
llm_summarizer/prompts.py
|
|
8
|
+
llm_summarizer/summarizer.py
|
|
9
|
+
llm_summarizer/token_counter.py
|
|
10
|
+
llm_summarizer/utils.py
|
|
11
|
+
text_summarizer_gi.egg-info/PKG-INFO
|
|
12
|
+
text_summarizer_gi.egg-info/SOURCES.txt
|
|
13
|
+
text_summarizer_gi.egg-info/dependency_links.txt
|
|
14
|
+
text_summarizer_gi.egg-info/requires.txt
|
|
15
|
+
text_summarizer_gi.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
llm_summarizer
|