contentintelpy-0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. contentintelpy-0.1.0/LICENSE +21 -0
  2. contentintelpy-0.1.0/PKG-INFO +156 -0
  3. contentintelpy-0.1.0/README.md +125 -0
  4. contentintelpy-0.1.0/contentintelpy/__init__.py +60 -0
  5. contentintelpy-0.1.0/contentintelpy/nodes/classification_node.py +49 -0
  6. contentintelpy-0.1.0/contentintelpy/nodes/keyword_extract_node.py +78 -0
  7. contentintelpy-0.1.0/contentintelpy/nodes/language_node.py +51 -0
  8. contentintelpy-0.1.0/contentintelpy/nodes/location_node.py +47 -0
  9. contentintelpy-0.1.0/contentintelpy/nodes/ner_node.py +46 -0
  10. contentintelpy-0.1.0/contentintelpy/nodes/sentiment_node.py +74 -0
  11. contentintelpy-0.1.0/contentintelpy/nodes/summarization_node.py +67 -0
  12. contentintelpy-0.1.0/contentintelpy/nodes/translation_node.py +91 -0
  13. contentintelpy-0.1.0/contentintelpy/pipeline/base_node.py +44 -0
  14. contentintelpy-0.1.0/contentintelpy/pipeline/context.py +36 -0
  15. contentintelpy-0.1.0/contentintelpy/pipeline/pipeline.py +30 -0
  16. contentintelpy-0.1.0/contentintelpy/services/ner_service.py +25 -0
  17. contentintelpy-0.1.0/contentintelpy/services/sentiment_service.py +34 -0
  18. contentintelpy-0.1.0/contentintelpy/services/summarization_service.py +25 -0
  19. contentintelpy-0.1.0/contentintelpy/services/translation_service.py +38 -0
  20. contentintelpy-0.1.0/contentintelpy/utils/model_registry.py +126 -0
  21. contentintelpy-0.1.0/contentintelpy.egg-info/PKG-INFO +156 -0
  22. contentintelpy-0.1.0/contentintelpy.egg-info/SOURCES.txt +25 -0
  23. contentintelpy-0.1.0/contentintelpy.egg-info/dependency_links.txt +1 -0
  24. contentintelpy-0.1.0/contentintelpy.egg-info/requires.txt +22 -0
  25. contentintelpy-0.1.0/contentintelpy.egg-info/top_level.txt +1 -0
  26. contentintelpy-0.1.0/pyproject.toml +53 -0
  27. contentintelpy-0.1.0/setup.cfg +4 -0
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 ContentIntelPy
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,156 @@
+ Metadata-Version: 2.4
+ Name: contentintelpy
+ Version: 0.1.0
+ Summary: Production-grade NLP library for unified content intelligence.
+ Author-email: Ronit Fulari <ronitfulari31@gmail.com>
+ License: MIT
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: numpy>=1.24.0
+ Requires-Dist: tqdm>=4.66.0
+ Provides-Extra: core
+ Requires-Dist: transformers<5.0.0,>=4.30.0; extra == "core"
+ Requires-Dist: torch<3.0.0,>=2.0.0; extra == "core"
+ Requires-Dist: sentence-transformers>=2.2.0; extra == "core"
+ Provides-Extra: ner
+ Requires-Dist: spacy>=3.7.0; extra == "ner"
+ Requires-Dist: gliner>=0.1.0; extra == "ner"
+ Provides-Extra: translation
+ Requires-Dist: argostranslate>=1.9.0; extra == "translation"
+ Provides-Extra: summarization
+ Requires-Dist: sumy>=0.11.0; extra == "summarization"
+ Provides-Extra: dev
+ Requires-Dist: pytest; extra == "dev"
+ Requires-Dist: black; extra == "dev"
+ Requires-Dist: isort; extra == "dev"
+ Dynamic: license-file
+
+ # contentintelpy
+
+ **Production-grade NLP library for unified content intelligence.**
+
+ `contentintelpy` provides a unified, DAG-based engine for multilingual sentiment analysis, NER, translation, and summarization using real transformer models (RoBERTa, GLiNER, NLLB).
+
+ ## Features
+
+ - **Real Models**: No heuristics; uses state-of-the-art transformers.
+   - Sentiment: RoBERTa
+   - NER: GLiNER
+   - Translation: NLLB (GPU) + ArgosTranslate (offline CPU)
+ - **Hybrid Execution**: Models are lazy-loaded and downloaded on first run; an offline fallback is available.
+ - **Deterministic Pipelines**: DAG-based execution guarantees node order.
+ - **Dual API**:
+   - **Pipeline-first** for complex workflows.
+   - **Service-first** for quick scripts.
+ - **Production Ready**: Thread-safe, with standard error handling and sparse outputs.
+
+ ## Installation
+
+ Install the base library:
+ ```bash
+ pip install contentintelpy
+ ```
+
+ ### Optional Dependencies (Recommended)
+ Since the library relies on heavy ML models, install only the components you need:
+
+ ```bash
+ # For all core features
+ pip install "contentintelpy[core,ner,translation,summarization]"
+
+ # For development
+ pip install "contentintelpy[dev]"
+ ```
+
+ > [!IMPORTANT]
+ > **spaCy Model Requirement**
+ > If you use the NER or language features, you must install a spaCy model manually:
+ > ```bash
+ > python -m spacy download en_core_web_sm
+ > ```
+
+ ---
+
+ ## Quick Start
+
+ Ideal for simple tasks in notebooks or scripts.
+
+ ```python
+ from contentintelpy import SentimentService, TranslationService
+
+ # Sentiment
+ service = SentimentService()
+ result = service.analyze("This library is amazing!")
+ print(result)
+ # {'value': 'positive', 'confidence': 0.99, ...}
+
+ # Translation
+ translator = TranslationService()
+ text = translator.translate("Hola mundo", target="en")
+ print(text)
+ # "Hello world"
+ ```
+
+ ## Production Usage (Pipeline-First)
+
+ Recommended for backends, APIs, and data pipelines.
+
+ ```python
+ import contentintelpy as ci
+
+ # 1. Create the canonical pipeline
+ pipeline = ci.create_default_pipeline()
+
+ # 2. Run it (thread-safe)
+ result = pipeline.run({
+     "text": "गूगल ने बेंगलुरु में नया कार्यालय खोला"
+ })
+
+ # 3. Access the sparse output
+ print(result)
+ ```
+
+ **Output Example:**
+ ```json
+ {
+   "text": "...",
+   "text_translated": "Google opened a new office in Bengaluru",
+   "language": "hi",
+   "entities": [
+     {"text": "Google", "label": "ORG"},
+     {"text": "Bengaluru", "label": "LOC"}
+   ],
+   "sentiment": {
+     "value": "neutral",
+     "value_en": "neutral",
+     "confidence": 0.95
+   },
+   "summary": "..."
+ }
+ ```
+
+ ## Error Handling
+
+ Nodes **never crash** the pipeline. Errors are collected in the `errors` dict.
+
+ ```python
+ {
+     "text": "...",
+     "errors": {
+         "TranslationNode": "Model download failed: Connection error"
+     }
+ }
+ ```
+
+ ## Architecture
+
+ This library is pure logic. It does **NOT** contain:
+ - Flask / FastAPI routes
+ - Database models
+ - Authentication
+
+ It is designed to be **consumed** by your backend application.
@@ -0,0 +1,125 @@
+ # contentintelpy
+
+ **Production-grade NLP library for unified content intelligence.**
+
+ `contentintelpy` provides a unified, DAG-based engine for multilingual sentiment analysis, NER, translation, and summarization using real transformer models (RoBERTa, GLiNER, NLLB).
+
+ ## Features
+
+ - **Real Models**: No heuristics; uses state-of-the-art transformers.
+   - Sentiment: RoBERTa
+   - NER: GLiNER
+   - Translation: NLLB (GPU) + ArgosTranslate (offline CPU)
+ - **Hybrid Execution**: Models are lazy-loaded and downloaded on first run; an offline fallback is available.
+ - **Deterministic Pipelines**: DAG-based execution guarantees node order.
+ - **Dual API**:
+   - **Pipeline-first** for complex workflows.
+   - **Service-first** for quick scripts.
+ - **Production Ready**: Thread-safe, with standard error handling and sparse outputs.
+
+ ## Installation
+
+ Install the base library:
+ ```bash
+ pip install contentintelpy
+ ```
+
+ ### Optional Dependencies (Recommended)
+ Since the library relies on heavy ML models, install only the components you need:
+
+ ```bash
+ # For all core features
+ pip install "contentintelpy[core,ner,translation,summarization]"
+
+ # For development
+ pip install "contentintelpy[dev]"
+ ```
+
+ > [!IMPORTANT]
+ > **spaCy Model Requirement**
+ > If you use the NER or language features, you must install a spaCy model manually:
+ > ```bash
+ > python -m spacy download en_core_web_sm
+ > ```
+
+ ---
+
+ ## Quick Start
+
+ Ideal for simple tasks in notebooks or scripts.
+
+ ```python
+ from contentintelpy import SentimentService, TranslationService
+
+ # Sentiment
+ service = SentimentService()
+ result = service.analyze("This library is amazing!")
+ print(result)
+ # {'value': 'positive', 'confidence': 0.99, ...}
+
+ # Translation
+ translator = TranslationService()
+ text = translator.translate("Hola mundo", target="en")
+ print(text)
+ # "Hello world"
+ ```
+
+ ## Production Usage (Pipeline-First)
+
+ Recommended for backends, APIs, and data pipelines.
+
+ ```python
+ import contentintelpy as ci
+
+ # 1. Create the canonical pipeline
+ pipeline = ci.create_default_pipeline()
+
+ # 2. Run it (thread-safe)
+ result = pipeline.run({
+     "text": "गूगल ने बेंगलुरु में नया कार्यालय खोला"
+ })
+
+ # 3. Access the sparse output
+ print(result)
+ ```
+
+ **Output Example:**
+ ```json
+ {
+   "text": "...",
+   "text_translated": "Google opened a new office in Bengaluru",
+   "language": "hi",
+   "entities": [
+     {"text": "Google", "label": "ORG"},
+     {"text": "Bengaluru", "label": "LOC"}
+   ],
+   "sentiment": {
+     "value": "neutral",
+     "value_en": "neutral",
+     "confidence": 0.95
+   },
+   "summary": "..."
+ }
+ ```
+
+ ## Error Handling
+
+ Nodes **never crash** the pipeline. Errors are collected in the `errors` dict.
+
+ ```python
+ {
+     "text": "...",
+     "errors": {
+         "TranslationNode": "Model download failed: Connection error"
+     }
+ }
+ ```
+
+ ## Architecture
+
+ This library is pure logic. It does **NOT** contain:
+ - Flask / FastAPI routes
+ - Database models
+ - Authentication
+
+ It is designed to be **consumed** by your backend application.
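
Given the sparse-output and error-handling contract described in this README, a consuming backend might defensively unwrap results along these lines. This is a minimal sketch, not from the package; it assumes `Pipeline.run()` returns a plain dict shaped like the examples above:

```python
import logging

import contentintelpy as ci

logger = logging.getLogger(__name__)
pipeline = ci.create_default_pipeline()

def analyze(text: str) -> dict:
    result = pipeline.run({"text": text})
    # Per the README, failed nodes record messages instead of raising,
    # so a partial result carrying an "errors" dict is still usable.
    for node, message in result.get("errors", {}).items():
        logger.warning("%s degraded: %s", node, message)
    return result
```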
@@ -0,0 +1,60 @@
+ from .pipeline.pipeline import Pipeline
+ from .pipeline.context import PipelineContext
+ from .pipeline.base_node import Node
+
+ # Import nodes for the default pipeline
+ from .nodes.language_node import LanguageDetectionNode
+ from .nodes.translation_node import TranslationNode
+ from .nodes.classification_node import CategoryClassificationNode
+ from .nodes.ner_node import NERNode
+ from .nodes.location_node import LocationExtractionNode
+ from .nodes.sentiment_node import SentimentNode
+ from .nodes.keyword_extract_node import KeywordExtractionNode
+ from .nodes.summarization_node import SummarizationNode
+
+ # Import services for public use
+ from .services.sentiment_service import SentimentService
+ from .services.translation_service import TranslationService
+ from .services.ner_service import NERService
+ from .services.summarization_service import SummarizationService
+
+ def create_default_pipeline() -> Pipeline:
+     """
+     Creates the canonical ContentIntelPy pipeline.
+
+     Execution order:
+     1. Language Detection (detects source language)
+     2. Translation (normalizes to English)
+     3. Classification (broad categorization)
+     4. NER (entity discovery)
+     5. Location Extraction (refines location entities)
+     6. Sentiment (analyzes tone)
+     7. Keyword Extraction (highlights)
+     8. Summarization (reduces content)
+
+     Returns:
+         A configured Pipeline instance ready to run.
+     """
+     nodes = [
+         LanguageDetectionNode(),
+         TranslationNode(target_lang="en"),  # Normalize to English
+         CategoryClassificationNode(),
+         NERNode(),
+         LocationExtractionNode(),
+         SentimentNode(),
+         KeywordExtractionNode(),
+         SummarizationNode(),
+     ]
+
+     return Pipeline(nodes)
+
+ __all__ = [
+     "Pipeline",
+     "create_default_pipeline",
+     "PipelineContext",
+     "Node",
+     "SentimentService",
+     "TranslationService",
+     "NERService",
+     "SummarizationService",
+ ]
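
Because `create_default_pipeline()` is simply `Pipeline(nodes)` over an ordered node list, a trimmed-down pipeline can be assembled the same way. A sketch, assuming the nodes compose individually just as the default pipeline implies and that the run result behaves like the dicts shown in the README:

```python
from contentintelpy import Pipeline
from contentintelpy.nodes.language_node import LanguageDetectionNode
from contentintelpy.nodes.sentiment_node import SentimentNode

# English-only sentiment pipeline: skip translation, NER, keywords,
# and summarization when the inputs are known to be English.
pipeline = Pipeline([
    LanguageDetectionNode(),
    SentimentNode(),
])

result = pipeline.run({"text": "This library is amazing!"})
print(result.get("sentiment"))
```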
@@ -0,0 +1,49 @@
+ from ..pipeline.base_node import Node
+ from ..pipeline.context import PipelineContext
+ from ..utils.model_registry import registry
+ from typing import Optional
+ import logging
+
+ logger = logging.getLogger("contentintelpy.nodes.classification")
+
+ class CategoryClassificationNode(Node):
+     """
+     Classifies text into categories using zero-shot classification (BART).
+     Default labels: Business, Politics, Sports, Technology, Entertainment, Health, Science, World.
+     """
+     DEFAULT_LABELS = [
+         "Business", "Politics", "Sports", "Technology",
+         "Entertainment", "Health", "Science", "World"
+     ]
+
+     def __init__(self, candidate_labels: Optional[list] = None):
+         super().__init__("CategoryClassificationNode")
+         self.candidate_labels = candidate_labels or self.DEFAULT_LABELS
+
+     def process(self, context: PipelineContext) -> PipelineContext:
+         # Prefer translated text if available, else the original
+         text = context.get("text_translated") or context.get("text")
+
+         if not text or not isinstance(text, str):
+             logger.warning("No text available for category classification.")
+             return context
+
+         try:
+             classifier = registry.get_classifier_pipeline()
+             # multi_label=False ensures the scores sum to 1
+             result = classifier(text, self.candidate_labels, multi_label=False)
+
+             # Result format: {'labels': ['Sports', ...], 'scores': [0.99, ...]}
+             if result and 'labels' in result and 'scores' in result:
+                 top_label = result['labels'][0]
+                 top_score = result['scores'][0]
+
+                 context["category"] = top_label
+                 context["category_score"] = top_score
+                 context["all_categories"] = dict(zip(result['labels'], result['scores']))
+                 logger.debug(f"Classified as: {top_label} ({top_score:.2f})")
+
+         except Exception as e:
+             logger.error(f"Classification failed: {e}")
+             context.add_error("CategoryClassificationNode", str(e))
+
+         return context
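
The node delegates to `registry.get_classifier_pipeline()`; since the docstring names BART, a plausible standalone equivalent is the Hugging Face zero-shot pipeline. A sketch only — the exact checkpoint the registry loads is an assumption:

```python
from transformers import pipeline

# facebook/bart-large-mnli is the stock zero-shot checkpoint;
# the registry may load a different one.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

result = classifier(
    "Google opened a new office in Bengaluru",
    candidate_labels=["Business", "Politics", "Sports", "Technology"],
    multi_label=False,  # scores form a distribution over the labels
)
print(result["labels"][0], result["scores"][0])  # e.g. "Business" 0.9...
```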
@@ -0,0 +1,78 @@
+ from ..pipeline.base_node import Node
+ from ..pipeline.context import PipelineContext
+ from ..utils.model_registry import registry
+ from sklearn.metrics.pairwise import cosine_similarity
+ from sklearn.feature_extraction.text import CountVectorizer
+ import numpy as np
+ import logging
+
+ logger = logging.getLogger("contentintelpy.nodes.keywords")
+
+ class KeywordExtractionNode(Node):
+     """
+     Extracts keywords using semantic embeddings (KeyBERT-style logic).
+
+     Algorithm:
+     1. Generate candidate n-grams (1-2 words).
+     2. Embed the document and candidates using a SentenceTransformer.
+     3. Calculate cosine similarity between document and candidates.
+     4. Return the top N candidates.
+     """
+     def __init__(self, top_n: int = 5):
+         super().__init__("KeywordExtractionNode")
+         self.top_n = top_n
+
+     def process(self, context: PipelineContext) -> PipelineContext:
+         text = context.get("text_translated") or context.get("text")
+
+         if not text or len(text.split()) < 3:
+             return context
+
+         try:
+             model = registry.get_embedding_model()
+
+             # 1. Candidate generation (simple CountVectorizer;
+             #    stopword removal is handled via stop_words="english")
+             n_gram_range = (1, 2)
+             count = CountVectorizer(ngram_range=n_gram_range, stop_words="english").fit([text])
+             candidates = count.get_feature_names_out()
+
+             if len(candidates) == 0:
+                 return context
+
+             # 2. Embeddings
+             doc_embedding = model.encode([text])
+             candidate_embeddings = model.encode(candidates)
+
+             # 3. Similarity, flattened to one score per candidate
+             distances = cosine_similarity(doc_embedding, candidate_embeddings)[0]
+
+             # 4. Top N (argpartition grabs the top elements cheaply,
+             #    then we sort just those by score, descending)
+             top_n = min(self.top_n, len(candidates))  # guard against short texts
+             keywords_idx = np.argpartition(distances, -top_n)[-top_n:]
+             keywords_idx = keywords_idx[np.argsort(distances[keywords_idx])][::-1]
+
+             keywords = []
+             for idx in keywords_idx:
+                 keywords.append({
+                     "text": candidates[idx],
+                     "score": float(distances[idx])
+                 })
+
+             context["keywords"] = keywords
+             logger.debug(f"Extracted {len(keywords)} keywords.")
+
+         except Exception as e:
+             # No fallback extractor is bundled, so fail softly and
+             # record the error on the context.
+             logger.error(f"Keyword extraction failed: {e}")
+             context.add_error("KeywordExtractionNode", str(e))
+
+         return context
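
The same four steps work outside the pipeline. A self-contained sketch of the KeyBERT-style logic — the `all-MiniLM-L6-v2` checkpoint is an assumption; the registry's embedding model may differ:

```python
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

text = "Transformer models dominate modern natural language processing."
model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed checkpoint

# 1. Candidate 1-2 grams with English stopwords removed
candidates = CountVectorizer(ngram_range=(1, 2), stop_words="english").fit([text]).get_feature_names_out()

# 2-3. Embed and score every candidate against the whole document
scores = cosine_similarity(model.encode([text]), model.encode(list(candidates)))[0]

# 4. Top 5 candidates by similarity
for idx in np.argsort(scores)[::-1][:5]:
    print(f"{candidates[idx]}: {scores[idx]:.3f}")
```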
@@ -0,0 +1,51 @@
+ from ..pipeline.base_node import Node
+ from ..pipeline.context import PipelineContext
+ from ..utils.model_registry import registry
+ import logging
+
+ logger = logging.getLogger("contentintelpy.nodes.language")
+
+ class LanguageDetectionNode(Node):
+     """
+     Detects the language of the input text.
+     Writes 'language' and 'language_score' to the context.
+     """
+     def __init__(self):
+         super().__init__("LanguageDetectionNode")
+
+     def process(self, context: PipelineContext) -> PipelineContext:
+         text = context.get("text")
+         if not text or not isinstance(text, str):
+             logger.warning("No text found in context for LanguageDetectionNode.")
+             context["language"] = "en"  # Default to English if no text
+             context["language_score"] = 0.0
+             return context
+
+         try:
+             detector = registry.get_language_detector()
+             # Truncate for detection speed and model input limits
+             snippet = text[:512]
+
+             result = detector(snippet)
+             # Result format: [{'label': 'en', 'score': 0.99}]
+
+             if result and len(result) > 0:
+                 top_result = result[0]
+                 lang_code = top_result['label']
+                 score = top_result['score']
+
+                 context["language"] = lang_code
+                 context["language_score"] = score
+                 logger.info(f"Detected language: {lang_code} ({score:.2f})")
+             else:
+                 context["language"] = "unknown"
+                 context["language_score"] = 0.0
+
+         except Exception as e:
+             # Write a safe default, then re-raise so the base node
+             # records the failure (bare raise preserves the traceback)
+             logger.error(f"Language detection model error: {e}")
+             context["language"] = "en"
+             context["language_score"] = 0.0
+             raise
+
+         return context
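
For reference, `registry.get_language_detector()` behaves like a Hugging Face text-classification pipeline returning `[{'label': ..., 'score': ...}]`. A standalone sketch with one commonly used detection checkpoint — which model the registry actually wraps is an assumption:

```python
from transformers import pipeline

# papluca/xlm-roberta-base-language-detection emits ISO 639-1 labels
# such as 'en' or 'hi'; the registry may use a different detector.
detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")

result = detector("गूगल ने बेंगलुरु में नया कार्यालय खोला"[:512])
print(result[0]["label"], round(result[0]["score"], 2))  # e.g. hi 0.99
```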
@@ -0,0 +1,47 @@
+ from ..pipeline.base_node import Node
+ from ..pipeline.context import PipelineContext
+ import logging
+
+ logger = logging.getLogger("contentintelpy.nodes.location")
+
+ class LocationExtractionNode(Node):
+     """
+     Refines location data from NER results.
+     Extracts 'Location', 'City', 'Country' entities into a dedicated 'locations' key.
+     Future upgrade: add actual geocoding (lat/lon) here.
+     """
+     LOCATION_LABELS = {"Location", "City", "Country", "GPE"}
+
+     def __init__(self):
+         super().__init__("LocationExtractionNode")
+
+     def process(self, context: PipelineContext) -> PipelineContext:
+         entities = context.get("entities", [])
+
+         if not entities:
+             return context
+
+         locations = []
+         seen = set()
+
+         for ent in entities:
+             label = ent.get("label")
+             text = ent.get("text")
+
+             if label in self.LOCATION_LABELS and text:
+                 # Basic case-insensitive deduplication
+                 clean_text = text.lower().strip()
+                 if clean_text not in seen:
+                     locations.append({
+                         "name": text,
+                         "type": label,
+                         # Placeholder for a future geocoding extension:
+                         # "coordinates": None
+                     })
+                     seen.add(clean_text)
+
+         if locations:
+             context["locations"] = locations
+             logger.debug(f"Extracted {len(locations)} unique locations.")
+
+         return context
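
The refinement step is plain filtering plus deduplication over the NER output. A sketch with plain dicts standing in for `PipelineContext`:

```python
# Entities as NERNode would serialize them (shape taken from ner_node.py)
entities = [
    {"text": "Bengaluru", "label": "City", "score": 0.97},
    {"text": "bengaluru", "label": "City", "score": 0.91},       # dropped as a duplicate
    {"text": "Google", "label": "Organization", "score": 0.99},  # not a location label
]

LOCATION_LABELS = {"Location", "City", "Country", "GPE"}
locations, seen = [], set()
for ent in entities:
    key = ent["text"].lower().strip()
    if ent["label"] in LOCATION_LABELS and key not in seen:
        locations.append({"name": ent["text"], "type": ent["label"]})
        seen.add(key)

print(locations)  # [{'name': 'Bengaluru', 'type': 'City'}]
```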
@@ -0,0 +1,46 @@
+ from ..pipeline.base_node import Node
+ from ..pipeline.context import PipelineContext
+ from ..utils.model_registry import registry
+ from typing import Optional
+ import logging
+
+ logger = logging.getLogger("contentintelpy.nodes.ner")
+
+ class NERNode(Node):
+     """
+     Named Entity Recognition using GLiNER.
+     Extracts: Person, Organization, Location, Date, etc.
+     """
+     LABELS = ["Person", "Organization", "Location", "City", "Country", "Date"]
+
+     def __init__(self, labels: Optional[list] = None):
+         super().__init__("NERNode")
+         self.labels = labels or self.LABELS
+
+     def process(self, context: PipelineContext) -> PipelineContext:
+         text = context.get("text_translated") or context.get("text")
+
+         if not text:
+             return context
+
+         try:
+             gliner = registry.get_gliner_model()
+             # GLiNER's predict_entities returns a list of dicts:
+             # {'text': ..., 'label': ..., 'score': float}
+             entities = gliner.predict_entities(text, self.labels)
+
+             # Normalize and serialize
+             serialized_entities = []
+             for ent in entities:
+                 serialized_entities.append({
+                     "text": ent["text"],
+                     "label": ent["label"],
+                     "score": float(ent.get("score", 0.0))
+                 })
+
+             context["entities"] = serialized_entities
+             logger.debug(f"Found {len(serialized_entities)} entities.")
+
+         except Exception as e:
+             logger.error(f"NER failed: {e}")
+             context.add_error("NERNode", str(e))
+
+         return context
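
GLiNER takes the label set at inference time, which is what lets this node accept custom labels. A standalone sketch — the `urchade/gliner_base` checkpoint is an assumption about what the registry loads:

```python
from gliner import GLiNER

# Any GLiNER checkpoint works here; the registry's choice is unknown.
model = GLiNER.from_pretrained("urchade/gliner_base")

labels = ["Person", "Organization", "Location", "City", "Country", "Date"]
entities = model.predict_entities("Google opened a new office in Bengaluru", labels)

for ent in entities:
    print(ent["text"], ent["label"], round(ent["score"], 2))
# e.g. Google Organization 0.98 / Bengaluru City 0.95
```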