contentintelpy 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contentintelpy-0.1.0/LICENSE +21 -0
- contentintelpy-0.1.0/PKG-INFO +156 -0
- contentintelpy-0.1.0/README.md +125 -0
- contentintelpy-0.1.0/contentintelpy/__init__.py +60 -0
- contentintelpy-0.1.0/contentintelpy/nodes/classification_node.py +49 -0
- contentintelpy-0.1.0/contentintelpy/nodes/keyword_extract_node.py +78 -0
- contentintelpy-0.1.0/contentintelpy/nodes/language_node.py +51 -0
- contentintelpy-0.1.0/contentintelpy/nodes/location_node.py +47 -0
- contentintelpy-0.1.0/contentintelpy/nodes/ner_node.py +46 -0
- contentintelpy-0.1.0/contentintelpy/nodes/sentiment_node.py +74 -0
- contentintelpy-0.1.0/contentintelpy/nodes/summarization_node.py +67 -0
- contentintelpy-0.1.0/contentintelpy/nodes/translation_node.py +91 -0
- contentintelpy-0.1.0/contentintelpy/pipeline/base_node.py +44 -0
- contentintelpy-0.1.0/contentintelpy/pipeline/context.py +36 -0
- contentintelpy-0.1.0/contentintelpy/pipeline/pipeline.py +30 -0
- contentintelpy-0.1.0/contentintelpy/services/ner_service.py +25 -0
- contentintelpy-0.1.0/contentintelpy/services/sentiment_service.py +34 -0
- contentintelpy-0.1.0/contentintelpy/services/summarization_service.py +25 -0
- contentintelpy-0.1.0/contentintelpy/services/translation_service.py +38 -0
- contentintelpy-0.1.0/contentintelpy/utils/model_registry.py +126 -0
- contentintelpy-0.1.0/contentintelpy.egg-info/PKG-INFO +156 -0
- contentintelpy-0.1.0/contentintelpy.egg-info/SOURCES.txt +25 -0
- contentintelpy-0.1.0/contentintelpy.egg-info/dependency_links.txt +1 -0
- contentintelpy-0.1.0/contentintelpy.egg-info/requires.txt +22 -0
- contentintelpy-0.1.0/contentintelpy.egg-info/top_level.txt +1 -0
- contentintelpy-0.1.0/pyproject.toml +53 -0
- contentintelpy-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 ContentIntelPy
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: contentintelpy
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Production-grade NLP library for unified content intelligence.
|
|
5
|
+
Author-email: Ronit Fulari <ronitfulari31@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: numpy>=1.24.0
|
|
14
|
+
Requires-Dist: tqdm>=4.66.0
|
|
15
|
+
Provides-Extra: core
|
|
16
|
+
Requires-Dist: transformers<5.0.0,>=4.30.0; extra == "core"
|
|
17
|
+
Requires-Dist: torch<3.0.0,>=2.0.0; extra == "core"
|
|
18
|
+
Requires-Dist: sentence-transformers>=2.2.0; extra == "core"
|
|
19
|
+
Provides-Extra: ner
|
|
20
|
+
Requires-Dist: spacy>=3.7.0; extra == "ner"
|
|
21
|
+
Requires-Dist: gliner>=0.1.0; extra == "ner"
|
|
22
|
+
Provides-Extra: translation
|
|
23
|
+
Requires-Dist: argostranslate>=1.9.0; extra == "translation"
|
|
24
|
+
Provides-Extra: summarization
|
|
25
|
+
Requires-Dist: sumy>=0.11.0; extra == "summarization"
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest; extra == "dev"
|
|
28
|
+
Requires-Dist: black; extra == "dev"
|
|
29
|
+
Requires-Dist: isort; extra == "dev"
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# contentintelpy
|
|
33
|
+
|
|
34
|
+
**Production-grade NLP library for unified content intelligence.**
|
|
35
|
+
|
|
36
|
+
`contentintelpy` provides a unified, DAG-based engine for multilingual sentiment analysis, NER, translation, and summarization using real transformer models (RoBERTa, GLiNER, NLLB).
|
|
37
|
+
|
|
38
|
+
## Features
|
|
39
|
+
|
|
40
|
+
- **Real Models**: No heuristics. Uses State-of-the-Art Transformers.
|
|
41
|
+
- Sentiment: RoBERTa
|
|
42
|
+
- NER: GLiNER
|
|
43
|
+
- Translation: NLLB (GPU) + ArgosTranslate (Offline CPU)
|
|
44
|
+
- **Hybrid Execution**: Models download on first run (lazy-loaded). Offline fallback available.
|
|
45
|
+
- **Deterministic Pipelines**: DAG-based execution guarantees order.
|
|
46
|
+
- **Dual API**:
|
|
47
|
+
- **Pipeline-first** for complex workflows.
|
|
48
|
+
- **Service-first** for quick scripts.
|
|
49
|
+
- **Production Ready**: Thread-safe, standard error handling, sparse outputs.
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
Install the base library:
|
|
54
|
+
```bash
|
|
55
|
+
pip install contentintelpy
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Optional Dependencies (Recommended)
|
|
59
|
+
Since the library uses heavy ML models, you should install the specific components you need:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# For all core features
|
|
63
|
+
pip install "contentintelpy[core,ner,translation,summarization]"
|
|
64
|
+
|
|
65
|
+
# For development
|
|
66
|
+
pip install "contentintelpy[dev]"
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
> [!IMPORTANT]
|
|
70
|
+
> **spaCy Model Requirement**
|
|
71
|
+
> If you use NER or language features, you must install a spaCy model manually:
|
|
72
|
+
> ```bash
|
|
73
|
+
> python -m spacy download en_core_web_sm
|
|
74
|
+
> ```
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Quick Start
|
|
79
|
+
|
|
80
|
+
Ideal for simple tasks in notebooks or scripts.
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from contentintelpy import SentimentService, TranslationService
|
|
84
|
+
|
|
85
|
+
# Sentiment
|
|
86
|
+
service = SentimentService()
|
|
87
|
+
result = service.analyze("This library is amazing!")
|
|
88
|
+
print(result)
|
|
89
|
+
# {'value': 'positive', 'confidence': 0.99, ...}
|
|
90
|
+
|
|
91
|
+
# Translation
|
|
92
|
+
translator = TranslationService()
|
|
93
|
+
text = translator.translate("Hola mundo", target="en")
|
|
94
|
+
print(text)
|
|
95
|
+
# "Hello world"
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Production Usage (Pipeline-First)
|
|
99
|
+
|
|
100
|
+
Recommended for Backends, APIs, and Data Pipelines.
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
import contentintelpy as ci
|
|
104
|
+
|
|
105
|
+
# 1. Create the canonical pipeline
|
|
106
|
+
pipeline = ci.create_default_pipeline()
|
|
107
|
+
|
|
108
|
+
# 2. Run it (Thread-safe)
|
|
109
|
+
result = pipeline.run({
|
|
110
|
+
"text": "गूगल ने बेंगलुरु में नया कार्यालय खोला"
|
|
111
|
+
})
|
|
112
|
+
|
|
113
|
+
# 3. Access Sparse Output
|
|
114
|
+
print(result)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
**Output Example:**
|
|
118
|
+
```json
|
|
119
|
+
{
|
|
120
|
+
"text": "...",
|
|
121
|
+
"text_translated": "Google opened a new office in Bengaluru",
|
|
122
|
+
"language": "hi",
|
|
123
|
+
"entities": [
|
|
124
|
+
{"text": "Google", "label": "ORG"},
|
|
125
|
+
{"text": "Bengaluru", "label": "LOC"}
|
|
126
|
+
],
|
|
127
|
+
"sentiment": {
|
|
128
|
+
"value": "neutral",
|
|
129
|
+
"value_en": "neutral",
|
|
130
|
+
"confidence": 0.95
|
|
131
|
+
},
|
|
132
|
+
"summary": "..."
|
|
133
|
+
}
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Error Handling
|
|
137
|
+
|
|
138
|
+
Nodes **never crash** the pipeline. Errors are collected in the `errors` dict.
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
{
|
|
142
|
+
"text": "...",
|
|
143
|
+
"errors": {
|
|
144
|
+
"TranslationNode": "Model download failed: Connection error"
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Architecture
|
|
150
|
+
|
|
151
|
+
This library is pure logic. It does **NOT** contain:
|
|
152
|
+
- Flask / FastAPI routes
|
|
153
|
+
- Database models
|
|
154
|
+
- Authentication
|
|
155
|
+
|
|
156
|
+
It is designed to be **consumed** by your backend application.
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# contentintelpy
|
|
2
|
+
|
|
3
|
+
**Production-grade NLP library for unified content intelligence.**
|
|
4
|
+
|
|
5
|
+
`contentintelpy` provides a unified, DAG-based engine for multilingual sentiment analysis, NER, translation, and summarization using real transformer models (RoBERTa, GLiNER, NLLB).
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Real Models**: No heuristics. Uses State-of-the-Art Transformers.
|
|
10
|
+
- Sentiment: RoBERTa
|
|
11
|
+
- NER: GLiNER
|
|
12
|
+
- Translation: NLLB (GPU) + ArgosTranslate (Offline CPU)
|
|
13
|
+
- **Hybrid Execution**: Models download on first run (lazy-loaded). Offline fallback available.
|
|
14
|
+
- **Deterministic Pipelines**: DAG-based execution guarantees order.
|
|
15
|
+
- **Dual API**:
|
|
16
|
+
- **Pipeline-first** for complex workflows.
|
|
17
|
+
- **Service-first** for quick scripts.
|
|
18
|
+
- **Production Ready**: Thread-safe, standard error handling, sparse outputs.
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
Install the base library:
|
|
23
|
+
```bash
|
|
24
|
+
pip install contentintelpy
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
### Optional Dependencies (Recommended)
|
|
28
|
+
Since the library uses heavy ML models, you should install the specific components you need:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# For all core features
|
|
32
|
+
pip install "contentintelpy[core,ner,translation,summarization]"
|
|
33
|
+
|
|
34
|
+
# For development
|
|
35
|
+
pip install "contentintelpy[dev]"
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
> [!IMPORTANT]
|
|
39
|
+
> **spaCy Model Requirement**
|
|
40
|
+
> If you use NER or language features, you must install a spaCy model manually:
|
|
41
|
+
> ```bash
|
|
42
|
+
> python -m spacy download en_core_web_sm
|
|
43
|
+
> ```
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Quick Start
|
|
48
|
+
|
|
49
|
+
Ideal for simple tasks in notebooks or scripts.
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from contentintelpy import SentimentService, TranslationService
|
|
53
|
+
|
|
54
|
+
# Sentiment
|
|
55
|
+
service = SentimentService()
|
|
56
|
+
result = service.analyze("This library is amazing!")
|
|
57
|
+
print(result)
|
|
58
|
+
# {'value': 'positive', 'confidence': 0.99, ...}
|
|
59
|
+
|
|
60
|
+
# Translation
|
|
61
|
+
translator = TranslationService()
|
|
62
|
+
text = translator.translate("Hola mundo", target="en")
|
|
63
|
+
print(text)
|
|
64
|
+
# "Hello world"
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Production Usage (Pipeline-First)
|
|
68
|
+
|
|
69
|
+
Recommended for Backends, APIs, and Data Pipelines.
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
import contentintelpy as ci
|
|
73
|
+
|
|
74
|
+
# 1. Create the canonical pipeline
|
|
75
|
+
pipeline = ci.create_default_pipeline()
|
|
76
|
+
|
|
77
|
+
# 2. Run it (Thread-safe)
|
|
78
|
+
result = pipeline.run({
|
|
79
|
+
"text": "गूगल ने बेंगलुरु में नया कार्यालय खोला"
|
|
80
|
+
})
|
|
81
|
+
|
|
82
|
+
# 3. Access Sparse Output
|
|
83
|
+
print(result)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
**Output Example:**
|
|
87
|
+
```json
|
|
88
|
+
{
|
|
89
|
+
"text": "...",
|
|
90
|
+
"text_translated": "Google opened a new office in Bengaluru",
|
|
91
|
+
"language": "hi",
|
|
92
|
+
"entities": [
|
|
93
|
+
{"text": "Google", "label": "ORG"},
|
|
94
|
+
{"text": "Bengaluru", "label": "LOC"}
|
|
95
|
+
],
|
|
96
|
+
"sentiment": {
|
|
97
|
+
"value": "neutral",
|
|
98
|
+
"value_en": "neutral",
|
|
99
|
+
"confidence": 0.95
|
|
100
|
+
},
|
|
101
|
+
"summary": "..."
|
|
102
|
+
}
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Error Handling
|
|
106
|
+
|
|
107
|
+
Nodes **never crash** the pipeline. Errors are collected in the `errors` dict.
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
{
|
|
111
|
+
"text": "...",
|
|
112
|
+
"errors": {
|
|
113
|
+
"TranslationNode": "Model download failed: Connection error"
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Architecture
|
|
119
|
+
|
|
120
|
+
This library is pure logic. It does **NOT** contain:
|
|
121
|
+
- Flask / FastAPI routes
|
|
122
|
+
- Database models
|
|
123
|
+
- Authentication
|
|
124
|
+
|
|
125
|
+
It is designed to be **consumed** by your backend application.
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from .pipeline.pipeline import Pipeline
|
|
2
|
+
from .pipeline.context import PipelineContext
|
|
3
|
+
from .pipeline.base_node import Node
|
|
4
|
+
|
|
5
|
+
# Import Nodes for the default pipeline
|
|
6
|
+
from .nodes.language_node import LanguageDetectionNode
|
|
7
|
+
from .nodes.translation_node import TranslationNode
|
|
8
|
+
from .nodes.classification_node import CategoryClassificationNode
|
|
9
|
+
from .nodes.ner_node import NERNode
|
|
10
|
+
from .nodes.location_node import LocationExtractionNode
|
|
11
|
+
from .nodes.sentiment_node import SentimentNode
|
|
12
|
+
from .nodes.keyword_extract_node import KeywordExtractionNode
|
|
13
|
+
from .nodes.summarization_node import SummarizationNode
|
|
14
|
+
|
|
15
|
+
# Import Services for public use
|
|
16
|
+
from .services.sentiment_service import SentimentService
|
|
17
|
+
from .services.translation_service import TranslationService
|
|
18
|
+
from .services.ner_service import NERService
|
|
19
|
+
from .services.summarization_service import SummarizationService
|
|
20
|
+
|
|
21
|
+
def create_default_pipeline() -> Pipeline:
    """Build the canonical ContentIntelPy pipeline.

    Nodes execute in a fixed order: language detection, translation
    (normalizing to English), zero-shot category classification, NER,
    location refinement, sentiment analysis, keyword extraction, and
    finally summarization.

    Returns:
        A configured Pipeline instance ready to run.
    """
    return Pipeline([
        LanguageDetectionNode(),
        # Downstream nodes prefer English text, so normalize early.
        TranslationNode(target_lang="en"),
        CategoryClassificationNode(),
        NERNode(),
        LocationExtractionNode(),
        SentimentNode(),
        KeywordExtractionNode(),
        SummarizationNode(),
    ])

__all__ = [
    "Pipeline",
    "create_default_pipeline",
    "PipelineContext",
    "Node",
    "SentimentService",
    "TranslationService",
    "NERService",
    "SummarizationService"
]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from ..pipeline.base_node import Node
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
from ..utils.model_registry import registry
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger("contentintelpy.nodes.classification")
|
|
7
|
+
|
|
8
|
+
class CategoryClassificationNode(Node):
    """
    Classifies text into categories using Zero-Shot Classification (BART).

    Writes 'category', 'category_score', and 'all_categories' to the
    context. Default labels: Business, Politics, Sports, Technology,
    Entertainment, Health, Science, World.
    """
    DEFAULT_LABELS = [
        "Business", "Politics", "Sports", "Technology",
        "Entertainment", "Health", "Science", "World"
    ]

    def __init__(self, candidate_labels: list = None):
        super().__init__("CategoryClassificationNode")
        # A falsy argument (None / empty list) falls back to the defaults.
        self.candidate_labels = candidate_labels or self.DEFAULT_LABELS

    def process(self, context: PipelineContext) -> PipelineContext:
        # Work on the English translation when one exists, else the original.
        text = context.get("text_translated") or context.get("text")

        if not (text and isinstance(text, str)):
            logger.warning("No text available for category classification.")
            return context

        try:
            classifier = registry.get_classifier_pipeline()
            # multi_label=False makes the returned scores sum to 1.
            outcome = classifier(text, self.candidate_labels, multi_label=False)

            # Expected shape: {'labels': ['Sports', ...], 'scores': [0.99, ...]}
            if outcome and 'labels' in outcome and 'scores' in outcome:
                top_label = outcome['labels'][0]
                top_score = outcome['scores'][0]

                context["category"] = top_label
                context["category_score"] = top_score
                context["all_categories"] = dict(zip(outcome['labels'], outcome['scores']))
                logger.debug(f"Classified as: {top_label} ({top_score:.2f})")

        except Exception as e:
            logger.error(f"Classification failed: {e}")
            context.add_error("CategoryClassificationNode", str(e))

        return context
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from ..pipeline.base_node import Node
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
from ..utils.model_registry import registry
|
|
4
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
5
|
+
from sklearn.feature_extraction.text import CountVectorizer
|
|
6
|
+
import numpy as np
|
|
7
|
+
import logging
|
|
8
|
+
import itertools
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger("contentintelpy.nodes.keywords")
|
|
11
|
+
|
|
12
|
+
class KeywordExtractionNode(Node):
    """
    Extracts keywords using semantic embeddings (KeyBERT-style logic).

    Algorithm:
        1. Generate candidate n-grams (1-2 words).
        2. Embed document and candidates using SentenceTransformer.
        3. Calculate cosine similarity of each candidate to the document.
        4. Write the top N candidates to context['keywords'] as
           {'text', 'score'} dicts, highest score first.
    """
    def __init__(self, top_n: int = 5):
        """
        Args:
            top_n: Maximum number of keywords to return.
        """
        super().__init__("KeywordExtractionNode")
        self.top_n = top_n

    def process(self, context: PipelineContext) -> PipelineContext:
        # Prefer the English translation when available.
        text = context.get("text_translated") or context.get("text")

        # Too little text to extract meaningful keywords from.
        if not text or len(text.split()) < 3:
            return context

        try:
            model = registry.get_embedding_model()

            # 1. Candidate generation: 1-2 word n-grams, English stopwords removed.
            count = CountVectorizer(ngram_range=(1, 2), stop_words="english").fit([text])
            candidates = count.get_feature_names_out()

            if len(candidates) == 0:
                return context

            # 2. Embed the document and every candidate phrase.
            doc_embedding = model.encode([text])
            candidate_embeddings = model.encode(candidates)

            # 3. Cosine similarity of each candidate to the whole document
            #    (flattened to a 1-D score vector).
            distances = cosine_similarity(doc_embedding, candidate_embeddings)[0]

            # 4. Top N by similarity. Clamp k so argpartition never receives a
            #    kth index outside the array: short texts can yield fewer
            #    candidates than top_n, which previously raised ValueError.
            k = min(self.top_n, len(candidates))
            keywords_idx = np.argpartition(distances, -k)[-k:]
            # Sort the selected indices by score, descending.
            keywords_idx = keywords_idx[np.argsort(distances[keywords_idx])][::-1]

            keywords = [
                {"text": candidates[idx], "score": float(distances[idx])}
                for idx in keywords_idx
            ]

            context["keywords"] = keywords
            logger.debug(f"Extracted {len(keywords)} keywords.")

        except Exception as e:
            # Fail softly: record the error and leave the context usable.
            logger.error(f"Keyword extraction failed: {e}")
            context.add_error("KeywordExtractionNode", str(e))

        return context
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from ..pipeline.base_node import Node
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
from ..utils.model_registry import registry
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger("contentintelpy.nodes.language")
|
|
7
|
+
|
|
8
|
+
class LanguageDetectionNode(Node):
    """
    Detects the language of the input text.

    Writes 'language' (ISO code, e.g. 'en') and 'language_score'
    (model confidence) to the context. Falls back to 'en' with score
    0.0 when no text is present; on model failure it records the same
    fallback and re-raises so the base node's error handling fires.
    """
    def __init__(self):
        super().__init__("LanguageDetectionNode")

    def process(self, context: PipelineContext) -> PipelineContext:
        text = context.get("text")
        if not text or not isinstance(text, str):
            logger.warning("No text found in context for LanguageDetectionNode.")
            context["language"] = "en"  # Default to English if no text
            context["language_score"] = 0.0
            return context

        try:
            detector = registry.get_language_detector()
            # Truncate: a 512-char snippet is plenty for detection and keeps
            # inference fast / within model input limits.
            snippet = text[:512]

            result = detector(snippet)
            # Result format: [{'label': 'en', 'score': 0.99}]

            if result and len(result) > 0:
                top_result = result[0]
                lang_code = top_result['label']
                score = top_result['score']

                context["language"] = lang_code
                context["language_score"] = score
                logger.info(f"Detected language: {lang_code} ({score:.2f})")
            else:
                context["language"] = "unknown"
                context["language_score"] = 0.0

        except Exception as e:
            # Leave a safe fallback in the context before re-raising.
            logger.error(f"Language detection model error: {e}")
            context["language"] = "en"  # Fallback safe default
            context["language_score"] = 0.0
            # Bare `raise` preserves the original traceback (unlike `raise e`)
            # while still triggering BaseNode error logging.
            raise

        return context
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from ..pipeline.base_node import Node
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
logger = logging.getLogger("contentintelpy.nodes.location")
|
|
6
|
+
|
|
7
|
+
class LocationExtractionNode(Node):
    """
    Refines location data from NER results.

    Scans context['entities'] for location-like labels and collects the
    unique ones (case-insensitive) under context['locations'] as
    {'name', 'type'} dicts. A future upgrade could add real geocoding
    (lat/lon) here.
    """
    LOCATION_LABELS = {"Location", "City", "Country", "GPE"}

    def __init__(self):
        super().__init__("LocationExtractionNode")

    def process(self, context: PipelineContext) -> PipelineContext:
        entities = context.get("entities", [])

        if not entities:
            return context

        found = []
        dedup_keys = set()

        for entity in entities:
            name = entity.get("text")
            kind = entity.get("label")

            if not name or kind not in self.LOCATION_LABELS:
                continue

            # Case-insensitive deduplication on the trimmed name.
            key = name.lower().strip()
            if key in dedup_keys:
                continue
            dedup_keys.add(key)

            found.append({
                "name": name,
                "type": kind,
                # Placeholder for future geocoding extension:
                # "coordinates": None
            })

        if found:
            context["locations"] = found
            logger.debug(f"Extracted {len(found)} unique locations.")

        return context
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from ..pipeline.base_node import Node
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
from ..utils.model_registry import registry
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger("contentintelpy.nodes.ner")
|
|
7
|
+
|
|
8
|
+
class NERNode(Node):
    """
    Named Entity Recognition using GLiNER.

    Extracts Person, Organization, Location, City, Country, Date (by
    default) and writes context['entities'] as a list of plain
    {'text', 'label', 'score'} dicts.
    """
    LABELS = ["Person", "Organization", "Location", "City", "Country", "Date"]

    def __init__(self, labels: list = None):
        super().__init__("NERNode")
        # A falsy argument (None / empty list) falls back to the defaults.
        self.labels = labels or self.LABELS

    def process(self, context: PipelineContext) -> PipelineContext:
        # Prefer the English translation when one exists.
        text = context.get("text_translated") or context.get("text")

        if not text:
            return context

        try:
            gliner = registry.get_gliner_model()
            # predict_entities yields dicts: {'text': ..., 'label': ..., 'score': float}
            raw_entities = gliner.predict_entities(text, self.labels)

            # Normalize into plain, JSON-serializable dicts.
            serialized_entities = [
                {
                    "text": ent["text"],
                    "label": ent["label"],
                    "score": float(ent.get("score", 0.0)),
                }
                for ent in raw_entities
            ]

            context["entities"] = serialized_entities
            logger.debug(f"Found {len(serialized_entities)} entities.")

        except Exception as e:
            logger.error(f"NER failed: {e}")
            context.add_error("NERNode", str(e))

        return context
return context
|