contentintelpy 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contentintelpy/__init__.py +60 -0
- contentintelpy/nodes/classification_node.py +49 -0
- contentintelpy/nodes/keyword_extract_node.py +78 -0
- contentintelpy/nodes/language_node.py +51 -0
- contentintelpy/nodes/location_node.py +47 -0
- contentintelpy/nodes/ner_node.py +46 -0
- contentintelpy/nodes/sentiment_node.py +74 -0
- contentintelpy/nodes/summarization_node.py +67 -0
- contentintelpy/nodes/translation_node.py +91 -0
- contentintelpy/pipeline/base_node.py +44 -0
- contentintelpy/pipeline/context.py +36 -0
- contentintelpy/pipeline/pipeline.py +30 -0
- contentintelpy/services/ner_service.py +25 -0
- contentintelpy/services/sentiment_service.py +34 -0
- contentintelpy/services/summarization_service.py +25 -0
- contentintelpy/services/translation_service.py +38 -0
- contentintelpy/utils/model_registry.py +126 -0
- contentintelpy-0.1.0.dist-info/METADATA +156 -0
- contentintelpy-0.1.0.dist-info/RECORD +22 -0
- contentintelpy-0.1.0.dist-info/WHEEL +5 -0
- contentintelpy-0.1.0.dist-info/licenses/LICENSE +21 -0
- contentintelpy-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from .pipeline.pipeline import Pipeline
|
|
2
|
+
from .pipeline.context import PipelineContext
|
|
3
|
+
from .pipeline.base_node import Node
|
|
4
|
+
|
|
5
|
+
# Import Nodes for the default pipeline
|
|
6
|
+
from .nodes.language_node import LanguageDetectionNode
|
|
7
|
+
from .nodes.translation_node import TranslationNode
|
|
8
|
+
from .nodes.classification_node import CategoryClassificationNode
|
|
9
|
+
from .nodes.ner_node import NERNode
|
|
10
|
+
from .nodes.location_node import LocationExtractionNode
|
|
11
|
+
from .nodes.sentiment_node import SentimentNode
|
|
12
|
+
from .nodes.keyword_extract_node import KeywordExtractionNode
|
|
13
|
+
from .nodes.summarization_node import SummarizationNode
|
|
14
|
+
|
|
15
|
+
# Import Services for public use
|
|
16
|
+
from .services.sentiment_service import SentimentService
|
|
17
|
+
from .services.translation_service import TranslationService
|
|
18
|
+
from .services.ner_service import NERService
|
|
19
|
+
from .services.summarization_service import SummarizationService
|
|
20
|
+
|
|
21
|
+
def create_default_pipeline() -> Pipeline:
    """
    Build the canonical ContentIntelPy pipeline.

    Nodes run strictly in this order:
        1. Language Detection  (detects source lang)
        2. Translation         (normalizes to English)
        3. Classification      (broad categorization)
        4. NER                 (entity discovery)
        5. Location Extraction (refines location entities)
        6. Sentiment           (analyzes tone)
        7. Keyword Extraction  (highlights)
        8. Summarization       (reduces content)

    Returns:
        A configured Pipeline instance ready to run.
    """
    return Pipeline([
        LanguageDetectionNode(),
        TranslationNode(target_lang="en"),  # normalize everything to English
        CategoryClassificationNode(),
        NERNode(),
        LocationExtractionNode(),
        SentimentNode(),
        KeywordExtractionNode(),
        SummarizationNode(),
    ])

__all__ = [
    "Pipeline",
    "create_default_pipeline",
    "PipelineContext",
    "Node",
    "SentimentService",
    "TranslationService",
    "NERService",
    "SummarizationService",
]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from ..pipeline.base_node import Node
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
from ..utils.model_registry import registry
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger("contentintelpy.nodes.classification")
|
|
7
|
+
|
|
8
|
+
class CategoryClassificationNode(Node):
    """
    Classifies text into categories using Zero-Shot Classification (BART).

    Writes 'category', 'category_score' and 'all_categories' to the context.
    Default labels: Business, Politics, Sports, Technology, Entertainment,
    Health, Science, World.
    """

    DEFAULT_LABELS = [
        "Business", "Politics", "Sports", "Technology",
        "Entertainment", "Health", "Science", "World"
    ]

    def __init__(self, candidate_labels: list = None):
        super().__init__("CategoryClassificationNode")
        self.candidate_labels = candidate_labels or self.DEFAULT_LABELS

    def process(self, context: PipelineContext) -> PipelineContext:
        # Work on translated text when present, otherwise the raw input
        text = context.get("text_translated") or context.get("text")
        if not text or not isinstance(text, str):
            logger.warning("No text available for category classification.")
            return context

        try:
            classifier = registry.get_classifier_pipeline()
            # multi_label=False keeps scores a proper distribution (sum to 1)
            result = classifier(text, self.candidate_labels, multi_label=False)

            # Expected shape: {'labels': [best-first...], 'scores': [...]}
            if result and 'labels' in result and 'scores' in result:
                labels, scores = result['labels'], result['scores']
                context["category"] = labels[0]
                context["category_score"] = scores[0]
                context["all_categories"] = dict(zip(labels, scores))
                logger.debug(f"Classified as: {labels[0]} ({scores[0]:.2f})")

        except Exception as e:
            logger.error(f"Classification failed: {e}")
            context.add_error("CategoryClassificationNode", str(e))

        return context
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from ..pipeline.base_node import Node
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
from ..utils.model_registry import registry
|
|
4
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
5
|
+
from sklearn.feature_extraction.text import CountVectorizer
|
|
6
|
+
import numpy as np
|
|
7
|
+
import logging
|
|
8
|
+
import itertools
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger("contentintelpy.nodes.keywords")
|
|
11
|
+
|
|
12
|
+
class KeywordExtractionNode(Node):
    """
    Extracts keywords using semantic embeddings (KeyBERT-style logic).

    Algorithm:
        1. Generate candidate n-grams (1-2 words).
        2. Embed document and candidates using SentenceTransformer.
        3. Calculate cosine similarity.
        4. Return top N candidates.

    Writes 'keywords' (list of {'text', 'score'} dicts, best first) to context.
    """
    def __init__(self, top_n: int = 5):
        super().__init__("KeywordExtractionNode")
        self.top_n = top_n

    def process(self, context: PipelineContext) -> PipelineContext:
        text = context.get("text_translated") or context.get("text")

        # Too little text to produce meaningful keywords
        if not text or len(text.split()) < 3:
            return context

        try:
            model = registry.get_embedding_model()

            # 1. Candidate generation (CountVectorizer drops English stopwords)
            count = CountVectorizer(ngram_range=(1, 2), stop_words="english").fit([text])
            candidates = count.get_feature_names_out()
            if len(candidates) == 0:
                return context

            # 2. Embed the whole document and every candidate phrase
            doc_embedding = model.encode([text])
            candidate_embeddings = model.encode(candidates)

            # 3. Cosine similarity of each candidate to the document (row vector)
            distances = cosine_similarity(doc_embedding, candidate_embeddings)[0]

            # 4. Top-K selection.
            # BUGFIX: clamp K to the candidate count. np.argpartition raises
            # ValueError when kth is out of range, which previously made short
            # texts (fewer candidates than top_n) silently yield no keywords.
            k = min(self.top_n, len(candidates))
            keywords_idx = np.argpartition(distances, -k)[-k:]
            # Order the selected indices by score, best first
            keywords_idx = keywords_idx[np.argsort(distances[keywords_idx])][::-1]

            keywords = [
                {"text": candidates[idx], "score": float(distances[idx])}
                for idx in keywords_idx
            ]

            context["keywords"] = keywords
            logger.debug(f"Extracted {len(keywords)} keywords.")

        except Exception as e:
            # No extractive fallback available; fail softly and record the error
            logger.error(f"Keyword extraction failed: {e}")
            context.add_error("KeywordExtractionNode", str(e))

        return context
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from ..pipeline.base_node import Node
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
from ..utils.model_registry import registry
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger("contentintelpy.nodes.language")
|
|
7
|
+
|
|
8
|
+
class LanguageDetectionNode(Node):
    """
    Detects the language of the input text.

    Writes 'language' (ISO code or 'unknown') and 'language_score' to context.
    On model failure the context keeps a safe 'en' default and the exception
    is re-raised so the base Node records the error.
    """
    def __init__(self):
        super().__init__("LanguageDetectionNode")

    def process(self, context: PipelineContext) -> PipelineContext:
        text = context.get("text")
        if not text or not isinstance(text, str):
            logger.warning("No text found in context for LanguageDetectionNode.")
            context["language"] = "en"  # Default to English if no text
            context["language_score"] = 0.0
            return context

        try:
            detector = registry.get_language_detector()
            # Truncate for detection speed / model input limit
            snippet = text[:512]

            result = detector(snippet)
            # Result format: [{'label': 'en', 'score': 0.99}]
            if result and len(result) > 0:
                top_result = result[0]
                lang_code = top_result['label']
                score = top_result['score']

                context["language"] = lang_code
                context["language_score"] = score
                logger.info(f"Detected language: {lang_code} ({score:.2f})")
            else:
                context["language"] = "unknown"
                context["language_score"] = 0.0

        except Exception as e:
            # Fallback if the model fails: keep a safe default in the context,
            # then let the BaseNode wrapper record the error.
            logger.error(f"Language detection model error: {e}")
            context["language"] = "en"
            context["language_score"] = 0.0
            # BUGFIX: bare `raise` preserves the original traceback;
            # `raise e` re-raised from this frame and truncated it.
            raise

        return context
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from ..pipeline.base_node import Node
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
logger = logging.getLogger("contentintelpy.nodes.location")
|
|
6
|
+
|
|
7
|
+
class LocationExtractionNode(Node):
    """
    Refines location data from NER results.

    Copies 'Location', 'City', 'Country' and 'GPE' entities into a dedicated
    'locations' key, deduplicated case-insensitively.
    Future upgrade: Add actual Geocoding (Lat/Lon) here.
    """
    LOCATION_LABELS = {"Location", "City", "Country", "GPE"}

    def __init__(self):
        super().__init__("LocationExtractionNode")

    def process(self, context: PipelineContext) -> PipelineContext:
        entities = context.get("entities", [])
        if not entities:
            return context

        locations = []
        seen = set()

        for entity in entities:
            name = entity.get("text")
            kind = entity.get("label")

            if kind not in self.LOCATION_LABELS or not name:
                continue

            # Case-insensitive dedup on the normalized surface form
            dedup_key = name.lower().strip()
            if dedup_key in seen:
                continue
            seen.add(dedup_key)

            locations.append({
                "name": name,
                "type": kind,
                # Placeholder for future geocoding extension:
                # "coordinates": None
            })

        if locations:
            context["locations"] = locations
            logger.debug(f"Extracted {len(locations)} unique locations.")

        return context
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from ..pipeline.base_node import Node
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
from ..utils.model_registry import registry
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger("contentintelpy.nodes.ner")
|
|
7
|
+
|
|
8
|
+
class NERNode(Node):
    """
    Named Entity Recognition using GLiNER.

    Extracts: Person, Organization, Location, Date, etc.
    Writes 'entities' (list of {'text', 'label', 'score'}) to context.
    """
    LABELS = ["Person", "Organization", "Location", "City", "Country", "Date"]

    def __init__(self, labels: list = None):
        super().__init__("NERNode")
        self.labels = labels or self.LABELS

    def process(self, context: PipelineContext) -> PipelineContext:
        text = context.get("text_translated") or context.get("text")
        if not text:
            return context

        try:
            gliner = registry.get_gliner_model()
            # predict_entities yields dicts: {'text': ..., 'label': ..., 'score': ...}
            raw_entities = gliner.predict_entities(text, self.labels)

            # Keep only plain JSON-friendly fields, coercing score to float
            serialized_entities = [
                {
                    "text": ent["text"],
                    "label": ent["label"],
                    "score": float(ent.get("score", 0.0)),
                }
                for ent in raw_entities
            ]

            context["entities"] = serialized_entities
            logger.debug(f"Found {len(serialized_entities)} entities.")

        except Exception as e:
            logger.error(f"NER failed: {e}")
            context.add_error("NERNode", str(e))

        return context
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from ..pipeline.base_node import Node
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
from ..utils.model_registry import registry
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger("contentintelpy.nodes.sentiment")
|
|
7
|
+
|
|
8
|
+
class SentimentNode(Node):
    """
    Analyzes emotional tone (Positive / Negative / Neutral) using RoBERTa.

    Writes 'sentiment' ({'value', 'value_en', 'confidence'}) to context,
    localizing the label to the original language when a mapping exists.
    """

    # Simple static mapping for target language labels (MVP approach)
    # In a full version, this could be dynamic or more extensive.
    LABEL_MAP = {
        "hi": {
            "positive": "सकारात्मक",
            "negative": "नकारात्मक",
            "neutral": "तटस्थ"
        },
        "es": {
            "positive": "positivo",
            "negative": "negativo",
            "neutral": "neutral"
        },
        "fr": {
            "positive": "positif",
            "negative": "négatif",
            "neutral": "neutre"
        },
        # Add more as needed
    }

    def __init__(self):
        super().__init__("SentimentNode")

    def process(self, context: PipelineContext) -> PipelineContext:
        # Analysis runs on the translated (English) text for model accuracy,
        # while the reported label is localized to the original language.
        text_to_analyze = context.get("text_translated") or context.get("text")
        original_lang = context.get("language", "en")

        if not text_to_analyze:
            return context

        try:
            analyzer = registry.get_sentiment_pipeline()
            # Truncate to model max length (~512 tokens)
            result = analyzer(text_to_analyze[:512])
            # Result: [{'label': 'positive', 'score': 0.98}]

            if result and len(result) > 0:
                top = result[0]
                label_en = top['label'].lower()  # positive / negative / neutral
                score = top['score']

                # Localize the label when the source language has a mapping
                label_native = self.LABEL_MAP.get(original_lang, {}).get(label_en, label_en)

                context["sentiment"] = {
                    "value": label_native,
                    "value_en": label_en,
                    "confidence": round(score, 4)
                }
                logger.debug(f"Sentiment: {label_en} ({score:.2f})")

        except Exception as e:
            logger.error(f"Sentiment analysis failed: {e}")
            context.add_error("SentimentNode", str(e))

        return context
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from ..pipeline.base_node import Node
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
from ..utils.model_registry import registry
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
# Sumy imports
|
|
7
|
+
from sumy.parsers.plaintext import PlaintextParser
|
|
8
|
+
from sumy.nlp.tokenizers import Tokenizer
|
|
9
|
+
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
|
|
10
|
+
from sumy.nlp.stemmers import Stemmer
|
|
11
|
+
from sumy.utils import get_stop_words
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger("contentintelpy.nodes.summarization")
|
|
14
|
+
|
|
15
|
+
class SummarizationNode(Node):
    """
    Summarizes text.

    Primary: BART (Generic Abstractive) via Transformers.
    Fallback: Sumy (LSA - Extractive).
    Writes 'summary' (and 'summary_method' on the fallback path) to context.
    """
    def __init__(self, max_length: int = 150, min_length: int = 40):
        super().__init__("SummarizationNode")
        self.max_length = max_length
        self.min_length = min_length

    def process(self, context: PipelineContext) -> PipelineContext:
        text = context.get("text_translated") or context.get("text")

        if not text or len(text.split()) < 30:
            logger.debug("Text too short for summarization.")
            return context

        # Try abstractive first, extractive as a safety net
        summary_text = self._summarize_bart(text, context)
        if not summary_text:
            summary_text = self._summarize_sumy(text, context)

        if summary_text:
            context["summary"] = summary_text

        return context

    def _summarize_bart(self, text: str, context: PipelineContext):
        """Primary path: abstractive BART summary. Returns None on failure."""
        try:
            summarizer = registry.get_summarization_pipeline()
            # Cap input size to avoid model errors on very long text
            input_text = text[:1024 * 4]
            result = summarizer(input_text, max_length=self.max_length,
                                min_length=self.min_length, do_sample=False)
            if result and len(result) > 0:
                return result[0]['summary_text']
        except Exception as e:
            logger.warning(f"BART summarization failed: {e}. Falling back to Sumy.")
            context.add_error("SummarizationNode_BART", str(e))
        return None

    def _summarize_sumy(self, text: str, context: PipelineContext):
        """Fallback path: extractive LSA via Sumy. Returns None on failure."""
        try:
            parser = PlaintextParser.from_string(text, Tokenizer("english"))
            lsa = Summarizer(Stemmer("english"))
            lsa.stop_words = get_stop_words("english")

            # Extract top 3 sentences
            sentences = lsa(parser.document, 3)
            context["summary_method"] = "sumy_lsa"
            return " ".join(str(s) for s in sentences)
        except Exception as e:
            logger.error(f"Sumy fallback failed: {e}")
            context.add_error("SummarizationNode_Sumy", str(e))
        return None
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
from ..pipeline.base_node import Node
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
from ..utils.model_registry import registry
|
|
4
|
+
import logging
|
|
5
|
+
import argostranslate.package
|
|
6
|
+
import argostranslate.translate
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger("contentintelpy.nodes.translation")
|
|
9
|
+
|
|
10
|
+
class TranslationNode(Node):
    """
    Normalizes text to the target language (default 'en').

    Strategy:
        1. Check if translation needed (source != target).
        2. Try NLLB (High Quality, GPU/CPU).
        3. Fallback to ArgosTranslate (Offline, CPU).

    Writes 'text_translated' and 'translation_method' to context; on total
    failure the original text is copied through unchanged.
    """
    def __init__(self, target_lang: str = "en", force: bool = False):
        super().__init__("TranslationNode")
        self.target_lang = target_lang
        self.force = force

    def process(self, context: PipelineContext) -> PipelineContext:
        original_text = context.get("text")
        source_lang = context.get("language", "unknown")

        # Nothing to translate
        if not original_text:
            return context

        # Skip when the text is already in the target language
        if source_lang == self.target_lang and not self.force:
            logger.info(f"Skipping translation: Source is already {source_lang}.")
            context["text_translated"] = original_text
            context["translation_method"] = "skipped"
            return context

        # Primary engine, then offline fallback
        translated_text = self._translate_nllb(original_text, source_lang, context)
        if not translated_text:
            translated_text = self._translate_argos(original_text, source_lang, context)

        if translated_text:
            context["text_translated"] = translated_text
        else:
            # Last resort: pass the original text through
            logger.warning("All translation methods failed. Keeping original text.")
            context["text_translated"] = original_text
            context["translation_method"] = "failed_copy"

        return context

    def _translate_nllb(self, text: str, source_lang: str, context: PipelineContext):
        """Primary path: NLLB via the registry pipeline. Returns None on failure.

        NOTE(review): NLLB expects its own language codes (e.g. 'hin_Deva');
        v0.1 passes ISO codes straight through and relies on the except below.
        """
        try:
            logger.info("Attempting translation with NLLB...")
            translator = registry.get_translation_pipeline()
            output = translator(text, src_lang=source_lang,
                                tgt_lang=self.target_lang, max_length=512)
            # Output format: [{'translation_text': '...'}]
            if output and len(output) > 0:
                context["translation_method"] = "nllb"
                return output[0]['translation_text']
        except Exception as e:
            logger.warning(f"NLLB translation failed: {e}. Falling back to Argos.")
            context.add_error("TranslationNode_NLLB", str(e))
        return None

    def _translate_argos(self, text: str, source_lang: str, context: PipelineContext):
        """Fallback path: offline ArgosTranslate. Returns None on failure.

        NOTE: installs the language package on first use -- a blocking network
        call; Argos manages its own package state outside the ModelRegistry.
        """
        try:
            logger.info("Attempting translation with ArgosTranslate...")
            argostranslate.package.update_package_index()
            available_packages = argostranslate.package.get_available_packages()
            package_to_install = next(
                (pkg for pkg in available_packages
                 if pkg.from_code == source_lang and pkg.to_code == self.target_lang),
                None
            )
            if package_to_install:
                argostranslate.package.install_from_path(package_to_install.download())

            result = argostranslate.translate.translate(text, source_lang, self.target_lang)
            context["translation_method"] = "argos"
            return result
        except Exception as e:
            logger.error(f"Argos translation failed: {e}")
            context.add_error("TranslationNode_Argos", str(e))
        return None
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Any
|
|
3
|
+
from .context import PipelineContext
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
# Configure reusable logger
|
|
7
|
+
logger = logging.getLogger("contentintelpy")
|
|
8
|
+
|
|
9
|
+
class Node(ABC):
    """
    Base class for all NLP processing nodes.

    Enforces the contract: read from context -> process -> write to context
    -> handle errors. Subclasses implement `process`; callers invoke `run`.
    """

    def __init__(self, name: str):
        self.name = name

    def run(self, context: PipelineContext) -> PipelineContext:
        """
        Public execution entry point. Handles error safety.
        DO NOT OVERRIDE THIS. Override `process` instead.
        """
        try:
            return self.process(context)
        except Exception as exc:
            # Soft failure: record the problem and let the pipeline continue
            error_msg = f"{type(exc).__name__}: {str(exc)}"
            logger.error(f"Node '{self.name}' failed: {error_msg}")
            context.add_error(self.name, error_msg)
            return context

    @abstractmethod
    def process(self, context: PipelineContext) -> PipelineContext:
        """
        Core logic implementation.
        Must mutate context in-place and return it.

        Args:
            context: The mutable pipeline state.

        Returns:
            The modified context.
        """
        ...
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from typing import Any, Dict, Optional
|
|
2
|
+
|
|
3
|
+
class PipelineContext:
    """
    Mutable state container for the NLP pipeline.

    Wraps a plain dictionary and offers helper methods for safe access and
    per-node error logging. An 'errors' dict is always present.
    """

    def __init__(self, initial_data: Optional[Dict[str, Any]] = None):
        self._data: Dict[str, Any] = initial_data or {}
        # Guarantee the errors bucket exists from the start
        self._data.setdefault("errors", {})

    def get(self, key: str, default: Any = None) -> Any:
        return self._data.get(key, default)

    def set(self, key: str, value: Any) -> None:
        self._data[key] = value

    def add_error(self, node_name: str, error_message: str) -> None:
        """Log a soft failure for a specific node."""
        self._data.setdefault("errors", {})[node_name] = error_message

    def to_dict(self) -> Dict[str, Any]:
        """Return the raw dictionary state."""
        return self._data

    def __getitem__(self, key: str) -> Any:
        return self._data[key]

    def __setitem__(self, key: str, value: Any) -> None:
        self._data[key] = value

    def __contains__(self, key: str) -> bool:
        return key in self._data
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from typing import List, Dict, Any
|
|
2
|
+
from .base_node import Node
|
|
3
|
+
from .context import PipelineContext
|
|
4
|
+
|
|
5
|
+
class Pipeline:
    """
    Linear execution engine.
    Runs a configured sequence of Nodes in strict order; each node receives
    the context mutated by its predecessors.
    """

    def __init__(self, nodes: List[Node]):
        self.nodes = nodes

    def run(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute the pipeline on the input data.

        Args:
            input_data: Dictionary containing the input (e.g. {"text": "..."})

        Returns:
            The final dictionary with results and any errors (sparse output).
        """
        # Wrap the raw dict so every node shares one mutable state object
        context = PipelineContext(input_data)

        for stage in self.nodes:
            context = stage.run(context)

        return context.to_dict()
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from ..nodes.ner_node import NERNode
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
from typing import List, Dict, Any
|
|
4
|
+
|
|
5
|
+
class NERService:
    """
    Service-first wrapper for Named Entity Recognition.
    """

    def __init__(self, labels: List[str] = None):
        self.node = NERNode(labels=labels)

    def extract(self, text: str) -> List[Dict[str, Any]]:
        """
        Extract entities from text.

        Returns:
            List of entities [{'text': '...', 'label': '...', 'score': ...}]
        """
        if not text:
            return []

        # Run the underlying node against a throwaway context;
        # the node soft-fails internally, leaving 'entities' unset on error.
        result = self.node.run(PipelineContext({"text": text}))
        return result.get("entities", [])
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from ..nodes.sentiment_node import SentimentNode
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
from typing import Dict, Any
|
|
4
|
+
|
|
5
|
+
class SentimentService:
    """
    Service-first wrapper for Sentiment Analysis.
    Simplifies usage to a single method call.
    """

    def __init__(self):
        self.node = SentimentNode()

    def analyze(self, text: str) -> Dict[str, Any]:
        """
        Analyze the sentiment of the text.

        Args:
            text: Input text (English preferred for best accuracy, but multilingual supported via Node labels)

        Returns:
            Dict containing 'value', 'value_en', 'confidence'.
        """
        if not text:
            return {}

        # Ad-hoc context; language pinned to 'en' so labels stay in English.
        # The node handles errors internally by logging/soft-failing, so no
        # try/except is needed here.
        result = self.node.run(PipelineContext({"text": text, "language": "en"}))

        # Specific output, or empty dict if the node failed
        return result.get("sentiment", {})
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from ..nodes.summarization_node import SummarizationNode
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
class SummarizationService:
    """
    Service-first wrapper for Summarization.
    """

    def __init__(self, max_length: int = 130, min_length: int = 30):
        self.node = SummarizationNode(max_length=max_length, min_length=min_length)

    def summarize(self, text: str) -> str:
        """
        Generate a summary of the text.

        Returns:
            Summary string.
        """
        if not text:
            return ""

        # Delegate to the node; it leaves 'summary' unset on failure
        result = self.node.run(PipelineContext({"text": text}))
        return result.get("summary", "")
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from ..nodes.translation_node import TranslationNode
|
|
2
|
+
from ..pipeline.context import PipelineContext
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
class TranslationService:
    """
    Service-first wrapper for Translation.
    Allows quick translation of text to any target language.
    """

    def __init__(self):
        # No node is cached here: the target language can differ per request,
        # so a fresh TranslationNode is built on every call.
        pass

    def translate(self, text: str, target: str = "en", source: str = "unknown") -> str:
        """
        Translate text.

        Args:
            text: Content to translate.
            target: Target language code (default 'en').
            source: Source language code (optional, optimizations if provided).

        Returns:
            Translated string. Returns original if failed.
        """
        if not text:
            return ""

        worker = TranslationNode(target_lang=target)
        result = worker.run(PipelineContext({
            "text": text,
            "language": source
        }))

        # Fall back to the input text when no translation was produced
        return result.get("text_translated", text)
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import logging
import threading
import warnings
from typing import Any, Callable, Optional
|
|
5
|
+
|
|
6
|
+
# Suppress HuggingFace warnings for cleaner logs
# NOTE(review): this filters ALL UserWarnings process-wide at import time,
# not just HuggingFace's — library consumers lose their own UserWarnings too.
warnings.filterwarnings("ignore", category=UserWarning)
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger("contentintelpy.registry")

class ModelRegistry:
    """
    Centralized registry for ML models.
    Implements:
    - Singleton access
    - Lazy loading (models load only when requested)
    - Thread safety (locks for concurrent access)
    - Caching
    """
    _instance = None
    _lock = threading.Lock()
    # Class-level cache shared by the singleton; keyed by resource name.
    _models = {}

    def __new__(cls):
        # Double-checked locking: cheap unlocked check first, then lock
        # and re-check so only one thread ever constructs the instance.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
        return cls._instance

    def _load_if_missing(self, key: str, loader_func: Callable[[], Any]) -> Any:
        """
        Generic thread-safe lazy loader.

        Args:
            key: Cache key identifying the resource.
            loader_func: Zero-argument callable that builds and returns
                the resource on first request.

        Returns:
            The cached (or freshly loaded) resource.

        Raises:
            Exception: re-raises whatever the loader raised on failure,
                after logging it; nothing is cached in that case.
        """
        if key not in self._models:
            with self._lock:
                # Double-check inside lock
                if key not in self._models:
                    try:
                        # Lazy %-style args: formatting only happens if the
                        # record is actually emitted.
                        logger.info("Loading resource '%s'... (This may take a moment)", key)
                        self._models[key] = loader_func()
                        logger.info("Successfully loaded '%s'.", key)
                    except Exception as e:
                        logger.error("Failed to load '%s': %s", key, e)
                        # Bare raise preserves the original traceback without
                        # appending an extra re-raise frame.
                        raise
        return self._models[key]

    # --------------------------------------------------------------------------
    # Sentiment Analysis (RoBERTa)
    # --------------------------------------------------------------------------
    def get_sentiment_pipeline(self):
        """Return the cached sentiment-analysis pipeline, loading it on first use."""
        def _loader():
            from transformers import pipeline
            # Use a high-quality multilingual or English sentiment model
            # CardiffNLP is standard for Twitter-like text, widely used
            model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
            return pipeline("sentiment-analysis", model=model_name, tokenizer=model_name, top_k=None)

        return self._load_if_missing("sentiment", _loader)

    # --------------------------------------------------------------------------
    # Translation (NLLB - GPU/Heavy)
    # --------------------------------------------------------------------------
    def get_translation_pipeline(self):
        """Return the cached NLLB translation pipeline, loading it on first use."""
        def _loader():
            from transformers import pipeline
            # NLLB-200 Distilled (600M) is a good balance of size/quality
            model_name = "facebook/nllb-200-distilled-600M"
            return pipeline("translation", model=model_name, tokenizer=model_name)

        return self._load_if_missing("translation_nllb", _loader)

    # --------------------------------------------------------------------------
    # NER (GLiNER)
    # --------------------------------------------------------------------------
    def get_gliner_model(self):
        """Return the cached GLiNER NER model, loading it on first use."""
        def _loader():
            from gliner import GLiNER
            # Standard GLiNER model
            return GLiNER.from_pretrained("urchade/gliner_large-v2.1")

        return self._load_if_missing("ner_gliner", _loader)

    # --------------------------------------------------------------------------
    # Zero-Shot Classification (BART)
    # --------------------------------------------------------------------------
    def get_classifier_pipeline(self):
        """Return the cached zero-shot classification pipeline, loading it on first use."""
        def _loader():
            from transformers import pipeline
            return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

        return self._load_if_missing("classification_bart", _loader)

    # --------------------------------------------------------------------------
    # Summarization (BART)
    # --------------------------------------------------------------------------
    def get_summarization_pipeline(self):
        """Return the cached summarization pipeline, loading it on first use."""
        def _loader():
            from transformers import pipeline
            return pipeline("summarization", model="facebook/bart-large-cnn")

        return self._load_if_missing("summarization", _loader)

    # --------------------------------------------------------------------------
    # Language Detection (Transformers)
    # --------------------------------------------------------------------------
    def get_language_detector(self):
        """Return the cached language-detection pipeline, loading it on first use."""
        def _loader():
            from transformers import pipeline
            return pipeline("text-classification", model="qanastek/51-languages-classifier")

        return self._load_if_missing("language_detection", _loader)

    # --------------------------------------------------------------------------
    # Keyword Extraction (Embeddings)
    # --------------------------------------------------------------------------
    def get_embedding_model(self):
        """Return the cached sentence-embedding model, loading it on first use."""
        def _loader():
            from sentence_transformers import SentenceTransformer
            # Fast, effective embedding model for semantic similarity
            return SentenceTransformer('all-MiniLM-L6-v2')

        return self._load_if_missing("keywords_embedding", _loader)

# Global accessor
registry = ModelRegistry()
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: contentintelpy
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Production-grade NLP library for unified content intelligence.
|
|
5
|
+
Author-email: Ronit Fulari <ronitfulari31@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: numpy>=1.24.0
|
|
14
|
+
Requires-Dist: tqdm>=4.66.0
|
|
15
|
+
Provides-Extra: core
|
|
16
|
+
Requires-Dist: transformers<5.0.0,>=4.30.0; extra == "core"
|
|
17
|
+
Requires-Dist: torch<3.0.0,>=2.0.0; extra == "core"
|
|
18
|
+
Requires-Dist: sentence-transformers>=2.2.0; extra == "core"
|
|
19
|
+
Provides-Extra: ner
|
|
20
|
+
Requires-Dist: spacy>=3.7.0; extra == "ner"
|
|
21
|
+
Requires-Dist: gliner>=0.1.0; extra == "ner"
|
|
22
|
+
Provides-Extra: translation
|
|
23
|
+
Requires-Dist: argostranslate>=1.9.0; extra == "translation"
|
|
24
|
+
Provides-Extra: summarization
|
|
25
|
+
Requires-Dist: sumy>=0.11.0; extra == "summarization"
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest; extra == "dev"
|
|
28
|
+
Requires-Dist: black; extra == "dev"
|
|
29
|
+
Requires-Dist: isort; extra == "dev"
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# contentintelpy
|
|
33
|
+
|
|
34
|
+
**Production-grade NLP library for unified content intelligence.**
|
|
35
|
+
|
|
36
|
+
`contentintelpy` provides a unified, DAG-based engine for multilingual sentiment analysis, NER, translation, and summarization using real transformer models (RoBERTa, GLiNER, NLLB).
|
|
37
|
+
|
|
38
|
+
## Features
|
|
39
|
+
|
|
40
|
+
- **Real Models**: No heuristics. Uses State-of-the-Art Transformers.
|
|
41
|
+
- Sentiment: RoBERTa
|
|
42
|
+
- NER: GLiNER
|
|
43
|
+
- Translation: NLLB (GPU) + ArgosTranslate (Offline CPU)
|
|
44
|
+
- **Hybrid Execution**: Models download on first run (lazy-loaded). Offline fallback available.
|
|
45
|
+
- **Deterministic Pipelines**: DAG-based execution guarantees order.
|
|
46
|
+
- **Dual API**:
|
|
47
|
+
- **Pipeline-first** for complex workflows.
|
|
48
|
+
- **Service-first** for quick scripts.
|
|
49
|
+
- **Production Ready**: Thread-safe, standard error handling, sparse outputs.
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
Install the base library:
|
|
54
|
+
```bash
|
|
55
|
+
pip install contentintelpy
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Optional Dependencies (Recommended)
|
|
59
|
+
Since the library uses heavy ML models, you should install the specific components you need:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# For all core features
|
|
63
|
+
pip install "contentintelpy[core,ner,translation,summarization]"
|
|
64
|
+
|
|
65
|
+
# For development
|
|
66
|
+
pip install "contentintelpy[dev]"
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
> [!IMPORTANT]
|
|
70
|
+
> **spaCy Model Requirement**
|
|
71
|
+
> If you use NER or language features, you must install a spaCy model manually:
|
|
72
|
+
> ```bash
|
|
73
|
+
> python -m spacy download en_core_web_sm
|
|
74
|
+
> ```
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Quick Start
|
|
79
|
+
|
|
80
|
+
Ideal for simple tasks in notebooks or scripts.
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from contentintelpy import SentimentService, TranslationService
|
|
84
|
+
|
|
85
|
+
# Sentiment
|
|
86
|
+
service = SentimentService()
|
|
87
|
+
result = service.analyze("This library is amazing!")
|
|
88
|
+
print(result)
|
|
89
|
+
# {'value': 'positive', 'confidence': 0.99, ...}
|
|
90
|
+
|
|
91
|
+
# Translation
|
|
92
|
+
translator = TranslationService()
|
|
93
|
+
text = translator.translate("Hola mundo", target="en")
|
|
94
|
+
print(text)
|
|
95
|
+
# "Hello world"
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Production Usage (Pipeline-First)
|
|
99
|
+
|
|
100
|
+
Recommended for Backends, APIs, and Data Pipelines.
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
import contentintelpy as ci
|
|
104
|
+
|
|
105
|
+
# 1. Create the canonical pipeline
|
|
106
|
+
pipeline = ci.create_default_pipeline()
|
|
107
|
+
|
|
108
|
+
# 2. Run it (Thread-safe)
|
|
109
|
+
result = pipeline.run({
|
|
110
|
+
"text": "गूगल ने बेंगलुरु में नया कार्यालय खोला"
|
|
111
|
+
})
|
|
112
|
+
|
|
113
|
+
# 3. Access Sparse Output
|
|
114
|
+
print(result)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
**Output Example:**
|
|
118
|
+
```json
|
|
119
|
+
{
|
|
120
|
+
"text": "...",
|
|
121
|
+
"text_translated": "Google opened a new office in Bengaluru",
|
|
122
|
+
"language": "hi",
|
|
123
|
+
"entities": [
|
|
124
|
+
{"text": "Google", "label": "ORG"},
|
|
125
|
+
{"text": "Bengaluru", "label": "LOC"}
|
|
126
|
+
],
|
|
127
|
+
"sentiment": {
|
|
128
|
+
"value": "neutral",
|
|
129
|
+
"value_en": "neutral",
|
|
130
|
+
"confidence": 0.95
|
|
131
|
+
},
|
|
132
|
+
"summary": "..."
|
|
133
|
+
}
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Error Handling
|
|
137
|
+
|
|
138
|
+
Nodes **never crash** the pipeline. Errors are collected in the `errors` dict.
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
{
|
|
142
|
+
"text": "...",
|
|
143
|
+
"errors": {
|
|
144
|
+
"TranslationNode": "Model download failed: Connection error"
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Architecture
|
|
150
|
+
|
|
151
|
+
This library is pure logic. It does **NOT** contain:
|
|
152
|
+
- Flask / FastAPI routes
|
|
153
|
+
- Database models
|
|
154
|
+
- Authentication
|
|
155
|
+
|
|
156
|
+
It is designed to be **consumed** by your backend application.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
contentintelpy/__init__.py,sha256=HSJcTru6PksguTH52iFj7ORhqSuHXIDHMQotj5KMmBc,1977
|
|
2
|
+
contentintelpy/nodes/classification_node.py,sha256=ivfAHdYXZU-5eVbtgxD94_TiRHcq-mJg4ukOc7KqwXU,2116
|
|
3
|
+
contentintelpy/nodes/keyword_extract_node.py,sha256=g_oERVXfE2VYVqFeqT7J2DIq-By1qXP7XsVOQc5R9mk,2897
|
|
4
|
+
contentintelpy/nodes/language_node.py,sha256=sKRa65kLrb1IRYGrkT82tu8LgdhIXdN5EwhUrH6pSqI,1971
|
|
5
|
+
contentintelpy/nodes/location_node.py,sha256=U3YQ31KclWNeoyrorodBAzAEd7zLmI31Deu72Viw1M0,1579
|
|
6
|
+
contentintelpy/nodes/ner_node.py,sha256=8DRg7NVpz8ZXcobgwYZsWkNOvaFfIj_ZEWG8wJckqus,1632
|
|
7
|
+
contentintelpy/nodes/sentiment_node.py,sha256=oFuw1Z0d6f4BWSYtnp8UN0gMCvL3nl5b4h68t6qv-cQ,2706
|
|
8
|
+
contentintelpy/nodes/summarization_node.py,sha256=kGLM4ssVd01h5hm4Cc5xpg73feCumg6b_hQRAi5gos4,2626
|
|
9
|
+
contentintelpy/nodes/translation_node.py,sha256=vd_RmkKmm07TynfKQdPvSwXeaG2sA8v28iADN1yzbmc,4325
|
|
10
|
+
contentintelpy/pipeline/base_node.py,sha256=hYLx2yAURpbmTr9x4kG8qVIlNI1Q0UJckBltW5LJl-o,1394
|
|
11
|
+
contentintelpy/pipeline/context.py,sha256=u_YsEe4oi-A6MM9igtQ0cOeX88fRd_Uj9umU4040W0E,1257
|
|
12
|
+
contentintelpy/pipeline/pipeline.py,sha256=gTgRcF34KxAJMxtac7wHdesD33q3CIP9hncvILHQ-3c,888
|
|
13
|
+
contentintelpy/services/ner_service.py,sha256=7-sEAqxYRpVksd-sZ5CPgAq3HfVeeb0OaRd0YPIqzPs,737
|
|
14
|
+
contentintelpy/services/sentiment_service.py,sha256=Yc6u0l8m_uN5ZxgUMr9DQziwi50cMlTZuaAOS8A7pJc,1130
|
|
15
|
+
contentintelpy/services/summarization_service.py,sha256=XK3vAGGoQS1dXxaO4nKjyrFlWwN_wZKY2qFNcDJ9IIM,748
|
|
16
|
+
contentintelpy/services/translation_service.py,sha256=6yNLLJ7mAE7ptHvprX1JUoUN-65Ot7ZdTszqqxMY1TA,1191
|
|
17
|
+
contentintelpy/utils/model_registry.py,sha256=OyixstAVsvQ-nkoICeZykdwcPDtuBZRtPHWmoIjfi2o,5344
|
|
18
|
+
contentintelpy-0.1.0.dist-info/licenses/LICENSE,sha256=lZ8hT4isGfdFVxdD7gDRnt3RJqyrkO1L5GseyN3A9hM,1092
|
|
19
|
+
contentintelpy-0.1.0.dist-info/METADATA,sha256=uqVJA361j3FW1xVekYg5dvbXXaOr2zOWZ073ks82lXM,4271
|
|
20
|
+
contentintelpy-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
21
|
+
contentintelpy-0.1.0.dist-info/top_level.txt,sha256=sxoE-r2-frUi3qwADEiYcFFxZW5hMI1Mjw87hcGMulQ,15
|
|
22
|
+
contentintelpy-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 ContentIntelPy
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
contentintelpy
|