hinglish-nlp 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hinglish_nlp-0.2.0/PKG-INFO +178 -0
- hinglish_nlp-0.2.0/README.md +160 -0
- hinglish_nlp-0.2.0/hinglish/__init__.py +38 -0
- hinglish_nlp-0.2.0/hinglish/analyzer.py +368 -0
- hinglish_nlp-0.2.0/hinglish_nlp.egg-info/PKG-INFO +178 -0
- hinglish_nlp-0.2.0/hinglish_nlp.egg-info/SOURCES.txt +9 -0
- hinglish_nlp-0.2.0/hinglish_nlp.egg-info/dependency_links.txt +1 -0
- hinglish_nlp-0.2.0/hinglish_nlp.egg-info/requires.txt +1 -0
- hinglish_nlp-0.2.0/hinglish_nlp.egg-info/top_level.txt +2 -0
- hinglish_nlp-0.2.0/pyproject.toml +28 -0
- hinglish_nlp-0.2.0/setup.cfg +4 -0
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hinglish-nlp
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Hinglish (Roman Hindi + English) NLP toolkit - Sentiment, Emotion, Sarcasm & more
|
|
5
|
+
Author-email: Lalit <official.lalitpal08@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Lalit2206/hinglish-nlp
|
|
8
|
+
Project-URL: Repository, https://github.com/Lalit2206/hinglish-nlp
|
|
9
|
+
Keywords: hinglish,nlp,sentiment,hindi,roman-hindi,text-analysis
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
14
|
+
Classifier: Natural Language :: Hindi
|
|
15
|
+
Requires-Python: >=3.9
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: pydantic>=2.0
|
|
18
|
+
|
|
19
|
+
# hinglish-nlp 🇮🇳
|
|
20
|
+
|
|
21
|
+
A powerful NLP toolkit for **Hinglish** (Roman Hindi + English) text analysis.
|
|
22
|
+
|
|
23
|
+
[PyPI version](https://badge.fury.io/py/hinglish-nlp)
|
|
24
|
+
[Python downloads](https://www.python.org/downloads/)
|
|
25
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install hinglish-nlp
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## Features
|
|
38
|
+
|
|
39
|
+
| Feature | Description |
|
|
40
|
+
|--------|-------------|
|
|
41
|
+
| ✅ Sentiment Analysis | Positive / Negative / Neutral / Mixed |
|
|
42
|
+
| ✅ Emotion Detection | Joy, Anger, Sadness, Fear, Surprise, Disgust |
|
|
43
|
+
| ✅ Sarcasm Detection | Pattern + contradiction based |
|
|
44
|
+
| ✅ Language Mix Detection | Hinglish vs English ratio |
|
|
45
|
+
| ✅ Transliteration | Roman Hindi → Devanagari |
|
|
46
|
+
| ✅ Batch Processing | Multiple texts at once |
|
|
47
|
+
| ✅ Key Phrase Extraction | Important phrases from text |
|
|
48
|
+
| ✅ Intensity Score | 0.0 to 1.0 scale |
|
|
49
|
+
| ✅ Confidence Score | How sure the model is |
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Usage
|
|
54
|
+
|
|
55
|
+
### Basic Sentiment Analysis
|
|
56
|
+
```python
|
|
57
|
+
from hinglish import analyze
|
|
58
|
+
|
|
59
|
+
result = analyze("yaar bahut mast movie thi!")
|
|
60
|
+
print(result["mood"]) # positive
|
|
61
|
+
print(result["emoji"]) # 😊
|
|
62
|
+
print(result["intensity"]) # 0.4
|
|
63
|
+
print(result["confidence"]) # 0.75
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Emotion Detection
|
|
67
|
+
```python
|
|
68
|
+
from hinglish import detect_emotion
|
|
69
|
+
|
|
70
|
+
emotions = detect_emotion("mujhe bahut gussa aa raha hai!")
|
|
71
|
+
print(emotions) # {'anger': 0.35}
|
|
72
|
+
|
|
73
|
+
emotions = detect_emotion("aaj bahut khushi hui yaar!")
|
|
74
|
+
print(emotions) # {'joy': 0.35}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Sarcasm Detection
|
|
78
|
+
```python
|
|
79
|
+
from hinglish import is_sarcastic
|
|
80
|
+
|
|
81
|
+
result = is_sarcastic("haan bilkul, bahut accha hai na!!")
|
|
82
|
+
print(result)
|
|
83
|
+
# {'is_sarcastic': True, 'confidence': 0.9}
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Language Mix Detection
|
|
87
|
+
```python
|
|
88
|
+
from hinglish import detect_language
|
|
89
|
+
|
|
90
|
+
mix = detect_language("yaar ye movie bahut boring thi")
|
|
91
|
+
print(mix)
|
|
92
|
+
# {'hinglish': 0.5, 'english': 0.33, 'unknown': 0.17}
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Transliteration (Roman → Devanagari)
|
|
96
|
+
```python
|
|
97
|
+
from hinglish import transliterate
|
|
98
|
+
|
|
99
|
+
text = transliterate("mera naam lalit hai")
|
|
100
|
+
print(text) # मेरा नाम lalit है  (words missing from the built-in map are left unchanged)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Batch Processing
|
|
104
|
+
```python
|
|
105
|
+
from hinglish import analyze_batch
|
|
106
|
+
|
|
107
|
+
texts = [
|
|
108
|
+
"yaar mast movie thi!",
|
|
109
|
+
"bilkul bakwaas tha yaar",
|
|
110
|
+
"theek thak tha, kuch khaas nahi"
|
|
111
|
+
]
|
|
112
|
+
|
|
113
|
+
results = analyze_batch(texts)
|
|
114
|
+
for r in results:
|
|
115
|
+
print(r["mood"], r["emoji"])
|
|
116
|
+
# positive 😊
|
|
117
|
+
# negative 😠
|
|
118
|
+
# neutral 😐
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Full Analysis
|
|
122
|
+
```python
|
|
123
|
+
from hinglish import analyze
|
|
124
|
+
|
|
125
|
+
result = analyze("Phone ki battery toh bekar hai but camera mast hai")
|
|
126
|
+
print(result)
|
|
127
|
+
# {
|
|
128
|
+
# 'mood': 'mixed',
|
|
129
|
+
# 'intensity': 0.3,
|
|
130
|
+
# 'confidence': 0.75,
|
|
131
|
+
# 'emoji': '🤨',
|
|
132
|
+
# 'sentiment': 'mixed',
|
|
133
|
+
# 'key_phrases': ['Phone ki battery toh bekar hai but camera mast hai'],
|
|
134
|
+
# 'sarcasm': False,
|
|
135
|
+
# 'sarcasm_confidence': 0.0,
|
|
136
|
+
# 'language_mix': {'hinglish': 0.36, 'english': 0.55, 'unknown': 0.09},
|
|
137
|
+
# 'category': 'mixed',
|
|
138
|
+
# 'summary': 'A detailed Hinglish message expressing mixed sentiment...',
|
|
139
|
+
# 'emotions': {'disgust': 0.35},
|
|
140
|
+
# 'word_count': 11,
|
|
141
|
+
# 'positive_words_found': ['mast'],
|
|
142
|
+
# 'negative_words_found': ['bekar'],
|
|
143
|
+
# 'transliteration': 'Phone की battery तो bekar है but camera मस्त है'
|
|
144
|
+
# }
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Output Fields
|
|
150
|
+
|
|
151
|
+
| Field | Type | Description |
|
|
152
|
+
|-------|------|-------------|
|
|
153
|
+
| `mood` | str | positive / negative / neutral / mixed |
|
|
154
|
+
| `intensity` | float | 0.0 – 1.0 |
|
|
155
|
+
| `confidence` | float | 0.0 – 1.0 |
|
|
156
|
+
| `emoji` | str | Visual mood indicator |
|
|
157
|
+
| `sentiment` | str | Same as mood |
|
|
158
|
+
| `key_phrases` | list | Important phrases |
|
|
159
|
+
| `sarcasm` | bool | Is text sarcastic? |
|
|
160
|
+
| `sarcasm_confidence` | float | Sarcasm confidence score |
|
|
161
|
+
| `language_mix` | dict | hinglish / english / unknown ratio |
|
|
162
|
+
| `category` | str | praise / complaint / casual / mixed |
|
|
163
|
+
| `summary` | str | Short summary of the text |
|
|
164
|
+
| `emotions` | dict | Detected emotions with scores |
|
|
165
|
+
| `word_count` | int | Total word count |
|
|
166
|
+
| `positive_words_found` | list | Positive words detected |
|
|
167
|
+
| `negative_words_found` | list | Negative words detected |
|
|
168
|
+
| `transliteration` | str | Roman → Devanagari |
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Author
|
|
173
|
+
|
|
174
|
+
**Lalit** — [lalitpal2206](https://pypi.org/user/lalitpal2206/)
|
|
175
|
+
|
|
176
|
+
## License
|
|
177
|
+
|
|
178
|
+
MIT License
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# hinglish-nlp 🇮🇳
|
|
2
|
+
|
|
3
|
+
A powerful NLP toolkit for **Hinglish** (Roman Hindi + English) text analysis.
|
|
4
|
+
|
|
5
|
+
[PyPI version](https://badge.fury.io/py/hinglish-nlp)
|
|
6
|
+
[Python downloads](https://www.python.org/downloads/)
|
|
7
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Installation
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install hinglish-nlp
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
|
|
21
|
+
| Feature | Description |
|
|
22
|
+
|--------|-------------|
|
|
23
|
+
| ✅ Sentiment Analysis | Positive / Negative / Neutral / Mixed |
|
|
24
|
+
| ✅ Emotion Detection | Joy, Anger, Sadness, Fear, Surprise, Disgust |
|
|
25
|
+
| ✅ Sarcasm Detection | Pattern + contradiction based |
|
|
26
|
+
| ✅ Language Mix Detection | Hinglish vs English ratio |
|
|
27
|
+
| ✅ Transliteration | Roman Hindi → Devanagari |
|
|
28
|
+
| ✅ Batch Processing | Multiple texts at once |
|
|
29
|
+
| ✅ Key Phrase Extraction | Important phrases from text |
|
|
30
|
+
| ✅ Intensity Score | 0.0 to 1.0 scale |
|
|
31
|
+
| ✅ Confidence Score | How sure the model is |
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Usage
|
|
36
|
+
|
|
37
|
+
### Basic Sentiment Analysis
|
|
38
|
+
```python
|
|
39
|
+
from hinglish import analyze
|
|
40
|
+
|
|
41
|
+
result = analyze("yaar bahut mast movie thi!")
|
|
42
|
+
print(result["mood"]) # positive
|
|
43
|
+
print(result["emoji"]) # 😊
|
|
44
|
+
print(result["intensity"]) # 0.4
|
|
45
|
+
print(result["confidence"]) # 0.75
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Emotion Detection
|
|
49
|
+
```python
|
|
50
|
+
from hinglish import detect_emotion
|
|
51
|
+
|
|
52
|
+
emotions = detect_emotion("mujhe bahut gussa aa raha hai!")
|
|
53
|
+
print(emotions) # {'anger': 0.35}
|
|
54
|
+
|
|
55
|
+
emotions = detect_emotion("aaj bahut khushi hui yaar!")
|
|
56
|
+
print(emotions) # {'joy': 0.35}
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Sarcasm Detection
|
|
60
|
+
```python
|
|
61
|
+
from hinglish import is_sarcastic
|
|
62
|
+
|
|
63
|
+
result = is_sarcastic("haan bilkul, bahut accha hai na!!")
|
|
64
|
+
print(result)
|
|
65
|
+
# {'is_sarcastic': True, 'confidence': 0.9}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Language Mix Detection
|
|
69
|
+
```python
|
|
70
|
+
from hinglish import detect_language
|
|
71
|
+
|
|
72
|
+
mix = detect_language("yaar ye movie bahut boring thi")
|
|
73
|
+
print(mix)
|
|
74
|
+
# {'hinglish': 0.5, 'english': 0.33, 'unknown': 0.17}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Transliteration (Roman → Devanagari)
|
|
78
|
+
```python
|
|
79
|
+
from hinglish import transliterate
|
|
80
|
+
|
|
81
|
+
text = transliterate("mera naam lalit hai")
|
|
82
|
+
print(text) # मेरा नाम lalit है  (words missing from the built-in map are left unchanged)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Batch Processing
|
|
86
|
+
```python
|
|
87
|
+
from hinglish import analyze_batch
|
|
88
|
+
|
|
89
|
+
texts = [
|
|
90
|
+
"yaar mast movie thi!",
|
|
91
|
+
"bilkul bakwaas tha yaar",
|
|
92
|
+
"theek thak tha, kuch khaas nahi"
|
|
93
|
+
]
|
|
94
|
+
|
|
95
|
+
results = analyze_batch(texts)
|
|
96
|
+
for r in results:
|
|
97
|
+
print(r["mood"], r["emoji"])
|
|
98
|
+
# positive 😊
|
|
99
|
+
# negative 😠
|
|
100
|
+
# neutral 😐
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Full Analysis
|
|
104
|
+
```python
|
|
105
|
+
from hinglish import analyze
|
|
106
|
+
|
|
107
|
+
result = analyze("Phone ki battery toh bekar hai but camera mast hai")
|
|
108
|
+
print(result)
|
|
109
|
+
# {
|
|
110
|
+
# 'mood': 'mixed',
|
|
111
|
+
# 'intensity': 0.3,
|
|
112
|
+
# 'confidence': 0.75,
|
|
113
|
+
# 'emoji': '🤨',
|
|
114
|
+
# 'sentiment': 'mixed',
|
|
115
|
+
# 'key_phrases': ['Phone ki battery toh bekar hai but camera mast hai'],
|
|
116
|
+
# 'sarcasm': False,
|
|
117
|
+
# 'sarcasm_confidence': 0.0,
|
|
118
|
+
# 'language_mix': {'hinglish': 0.36, 'english': 0.55, 'unknown': 0.09},
|
|
119
|
+
# 'category': 'mixed',
|
|
120
|
+
# 'summary': 'A detailed Hinglish message expressing mixed sentiment...',
|
|
121
|
+
# 'emotions': {'disgust': 0.35},
|
|
122
|
+
# 'word_count': 11,
|
|
123
|
+
# 'positive_words_found': ['mast'],
|
|
124
|
+
# 'negative_words_found': ['bekar'],
|
|
125
|
+
# 'transliteration': 'Phone की battery तो bekar है but camera मस्त है'
|
|
126
|
+
# }
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## Output Fields
|
|
132
|
+
|
|
133
|
+
| Field | Type | Description |
|
|
134
|
+
|-------|------|-------------|
|
|
135
|
+
| `mood` | str | positive / negative / neutral / mixed |
|
|
136
|
+
| `intensity` | float | 0.0 – 1.0 |
|
|
137
|
+
| `confidence` | float | 0.0 – 1.0 |
|
|
138
|
+
| `emoji` | str | Visual mood indicator |
|
|
139
|
+
| `sentiment` | str | Same as mood |
|
|
140
|
+
| `key_phrases` | list | Important phrases |
|
|
141
|
+
| `sarcasm` | bool | Is text sarcastic? |
|
|
142
|
+
| `sarcasm_confidence` | float | Sarcasm confidence score |
|
|
143
|
+
| `language_mix` | dict | hinglish / english / unknown ratio |
|
|
144
|
+
| `category` | str | praise / complaint / casual / mixed |
|
|
145
|
+
| `summary` | str | Short summary of the text |
|
|
146
|
+
| `emotions` | dict | Detected emotions with scores |
|
|
147
|
+
| `word_count` | int | Total word count |
|
|
148
|
+
| `positive_words_found` | list | Positive words detected |
|
|
149
|
+
| `negative_words_found` | list | Negative words detected |
|
|
150
|
+
| `transliteration` | str | Roman → Devanagari |
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## Author
|
|
155
|
+
|
|
156
|
+
**Lalit** — [lalitpal2206](https://pypi.org/user/lalitpal2206/)
|
|
157
|
+
|
|
158
|
+
## License
|
|
159
|
+
|
|
160
|
+
MIT License
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
hinglish-nlp
|
|
3
|
+
============
|
|
4
|
+
A powerful NLP toolkit for Hinglish (Roman Hindi + English) text analysis.
|
|
5
|
+
|
|
6
|
+
Quick start
|
|
7
|
+
-----------
|
|
8
|
+
>>> from hinglish import analyze
|
|
9
|
+
>>> result = analyze("yaar bahut mast movie thi!")
|
|
10
|
+
>>> print(result["mood"]) # positive
|
|
11
|
+
>>> print(result["emotions"]) # {'joy': 0.35}
|
|
12
|
+
|
|
13
|
+
>>> from hinglish import analyze_batch, transliterate, detect_emotion
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from .analyzer import (
|
|
17
|
+
analyze,
|
|
18
|
+
analyze_batch,
|
|
19
|
+
transliterate,
|
|
20
|
+
detect_language,
|
|
21
|
+
detect_emotion,
|
|
22
|
+
is_sarcastic,
|
|
23
|
+
HinglishAnalysis,
|
|
24
|
+
HinglishAnalyzer,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# Package version string.
__version__ = "0.2.0"
__author__ = "Lalit"
# Names exported by `from hinglish import *`; these are the same names that
# are imported from `.analyzer` in this module.
__all__ = [
    "analyze",
    "analyze_batch",
    "transliterate",
    "detect_language",
    "detect_emotion",
    "is_sarcastic",
    "HinglishAnalysis",
    "HinglishAnalyzer",
]
|
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Dict, List, Any, Optional
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# LEXICON
|
|
7
|
+
# Positive-sentiment lexicon (Roman Hindi, English and common SMS-style
# abbreviations). Lookups elsewhere in this module test single lowercase
# `\w+` tokens against this set.
# NOTE(review): the multi-word entries ("too good", "ekdum sahi", "full mast",
# "bilkul sahi", "dum hai", "kya baat", "maja aaya") cannot be hit by a
# single-token lookup; they are kept so the data is not lost.
POSITIVE_WORDS = {
    # General positive
    "mast", "badiya", "zabardast", "awesome", "superb", "love", "best",
    "perfect", "great", "achha", "accha", "nice", "kamaal", "shandaar",
    "dhamaka", "bindaas", "jhakaas", "wah", "waah", "solid", "killer",
    "dope", "lit", "fire", "too good", "ekdum sahi", "full mast",
    "khush", "khushi", "maza", "party", "celebrate", "lajawaab",
    "shaandaar", "gazab", "jalwa", "badhiya", "outstanding", "excellent",
    "wonderful", "fantastic", "brilliant", "amazing", "fabulous",
    "incredible", "marvelous", "splendid", "terrific", "spectacular",
    "magnificent", "glorious", "lovely", "beautiful", "gorgeous",
    "stunning", "charming", "delightful", "enjoyable", "fun", "exciting",
    "thrilling", "refreshing", "satisfying", "pleasing", "impressive",
    "remarkable", "extraordinary", "phenomenal", "exceptional", "top",
    "sahi", "bilkul sahi", "dum", "dum hai", "kya baat", "ekdum",
    "maja", "maja aaya", "mst", "bdhiya", "zbrdsst", "kmaal",
    "happy", "glad", "joyful", "cheerful", "content", "pleased",
    "grateful", "thankful", "blessed", "fortunate", "lucky",
    "proud", "confident", "hopeful", "optimistic", "positive",
    "energetic", "enthusiastic", "passionate", "motivated", "inspired",
    "relaxed", "peaceful", "calm", "comfortable", "satisfied",
}
|
|
29
|
+
|
|
30
|
+
# Negative-sentiment lexicon (Roman Hindi and English). Lookups elsewhere in
# this module test single lowercase `\w+` tokens against this set.
# NOTE(review): "time waste" is a two-word entry and cannot be hit by a
# single-token lookup; it is kept so the data is not lost.
NEGATIVE_WORDS = {
    # General negative
    "bakwaas", "bekar", "kharaab", "worst", "faltu", "garbage",
    "disappoint", "boring", "time waste", "bekaar", "ganda", "bura",
    "kharab", "wahiyat", "forma", "useless", "pathetic",
    "terrible", "horrible", "awful", "dreadful", "atrocious", "abysmal",
    "disgusting", "revolting", "repulsive", "nasty", "vile", "foul",
    "lousy", "poor", "inferior", "substandard", "inadequate", "deficient",
    "flawed", "faulty", "broken", "damaged", "ruined", "destroyed",
    "failed", "failure", "disaster", "catastrophe", "tragedy", "mess",
    "problem", "issue", "trouble", "difficulty", "challenge", "obstacle",
    "frustrating", "annoying", "irritating", "aggravating", "infuriating",
    "disappointing", "unsatisfying", "dissatisfying", "displeasing",
    "sad", "unhappy", "miserable", "depressed", "gloomy", "melancholy",
    "upset", "distressed", "troubled", "worried", "anxious", "stressed",
    "angry", "furious", "enraged", "outraged", "livid", "irate",
    "disgusted", "repelled", "appalled", "horrified", "shocked",
    "bored", "dull", "tedious", "monotonous", "repetitive", "bland",
    "nahi", "mat", "bandh", "band", "chup", "shut",
}
|
|
50
|
+
|
|
51
|
+
# Degree words (Roman Hindi and English). HinglishAnalyzer.analyze counts how
# many tokens of the text are in this set and adds 0.15 to the intensity score
# for each one.
INTENSIFIERS = {
    "bahut", "bohot", "bht", "very", "super", "ekdum", "bilkul",
    "sabse", "itna", "utna", "kitna", "zyada", "boht", "bhot",
    "extremely", "incredibly", "absolutely", "totally", "completely",
    "utterly", "highly", "deeply", "strongly", "severely", "greatly",
    "tremendously", "enormously", "immensely", "vastly", "profoundly",
}
|
|
58
|
+
|
|
59
|
+
# Emotion name -> set of trigger words. HinglishAnalyzer.detect_emotions
# intersects the set of input words with each of these sets and scores an
# emotion at 0.35 per distinct match (capped at 1.0).
# NOTE(review): entries containing a space ("maja aaya", "no way",
# "yaar sach", "are bhai", "darr gaya", "darta hun", "daro mat", "bura laga")
# cannot match when the input words are single `\w+` tokens, which is how
# analyze() and detect_emotion() build them - confirm whether phrase matching
# was intended.
EMOTION_LEXICON = {
    "anger": {
        "gussa", "ghussa", "pagal", "bakwas", "beizzati", "insult",
        "angry", "furious", "rage", "mad", "irritated", "annoyed",
        "frustrated", "outraged", "livid", "enraged", "irate",
        "aggressive", "hostile", "violent", "hateful", "resentful",
        "jalega", "jala", "jalao", "maar", "marunga", "toot",
    },
    "joy": {
        "khushi", "maza", "party", "celebrate", "mast", "khush",
        "happy", "joyful", "cheerful", "delighted", "elated", "ecstatic",
        "thrilled", "excited", "glad", "pleased", "content", "satisfied",
        "blissful", "euphoric", "overjoyed", "jubilant", "radiant",
        "maja", "maja aaya", "bindaas", "dhamaal", "moj", "masti",
    },
    "sadness": {
        "dukh", "rona", "bura", "miss", "akela", "dard", "toot",
        "sad", "unhappy", "miserable", "depressed", "gloomy", "melancholy",
        "sorrowful", "heartbroken", "devastated", "grief", "mourning",
        "lonely", "isolated", "abandoned", "rejected", "hurt", "pain",
        "cry", "tears", "weep", "sob", "lament", "grieve",
        "roya", "rote", "aansu", "tadap", "bichar",
    },
    "surprise": {
        "kya", "seriously", "matlab", "no way", "sach", "sachchi",
        "surprised", "shocked", "astonished", "amazed", "stunned",
        "bewildered", "dumbfounded", "flabbergasted", "astounded",
        "unexpected", "unbelievable", "incredible", "wow", "omg",
        "arre", "arrey", "oye", "yaar sach", "are bhai",
    },
    "fear": {
        "dar", "darr", "tension", "problem", "mushkil", "dara",
        "afraid", "scared", "frightened", "terrified", "horrified",
        "anxious", "nervous", "worried", "stressed", "panicked",
        "dread", "terror", "phobia", "paranoid", "threatened",
        "darr gaya", "darta hun", "daro mat",
    },
    "disgust": {
        "yuck", "chhi", "ganda", "ulti", "bura laga", "nafrat",
        "disgusting", "revolting", "repulsive", "nasty", "vile",
        "gross", "horrible", "awful", "terrible", "dreadful",
        "loathsome", "abhorrent", "detestable", "despicable",
        "hate", "hatred", "abhor", "detest", "despise", "loathe",
    },
}
|
|
104
|
+
|
|
105
|
+
# Regular expressions searched against the lowercased text by
# HinglishAnalyzer.detect_sarcasm; every pattern that matches adds 0.3 to the
# sarcasm score.
SARCASM_PATTERNS = [
    r"bahut\s+acch[ao]\s+hai\s*na",
    r"haan\s+bilkul",
    r"wah\s+kya\s+baat",
    r"!{2,}",  # two or more consecutive exclamation marks
    r"(?:oh\s+)?sure\s+yaar",
    r"bilkul\s+sahi\s+(?:hai|tha|thi)",
    r"kitna\s+(?:accha|mast|badiya)\s+(?:hai|tha|thi)\s*(?:na|yaar)?",
    r"great\s+yaar",
    r"very\s+nice\s+(?:yaar|bhai)",
]
|
|
116
|
+
|
|
117
|
+
# Vocabulary treated as "Hinglish" by HinglishAnalyzer.detect_language_mix:
# a lowercase token found in this set counts towards the `hinglish` ratio.
HINGLISH_WORDS = {
    "yaar", "bhai", "mein", "hai", "toh", "kya", "nahi", "haan",
    "abhi", "kal", "aaj", "phir", "kab", "kahan", "kyun", "kaisa",
    "accha", "achha", "theek", "sahi", "matlab", "samjha", "dekho",
    "suno", "bolo", "jao", "aao", "ruko", "chalo", "batao",
    "mast", "badiya", "zabardast", "bindaas", "dhamaka", "kamaal",
    "bilkul", "ekdum", "bahut", "bohot", "itna", "zyada",
    "ghar", "dost", "paisa", "kaam", "time", "baat", "cheez",
    "log", "aadmi", "ladka", "ladki", "bachha", "mama", "papa",
    "khana", "pani", "chai", "coffee", "movie", "gana", "game",
    "phone", "laptop", "net", "wifi", "app", "online",
    "arre", "arrey", "oye", "boss", "dude",
}
|
|
130
|
+
|
|
131
|
+
# Word-level lookup table from lowercase Roman-script Hindi to Devanagari,
# used by HinglishAnalyzer.transliterate. Words that are not keys of this
# table are passed through unchanged by that method.
TRANSLITERATION_MAP = {
    "mera": "मेरा", "tera": "तेरा", "uska": "उसका", "hamara": "हमारा",
    "naam": "नाम", "ghar": "घर", "dost": "दोस्त", "pyaar": "प्यार",
    "khushi": "खुशी", "dukh": "दुःख", "zindagi": "ज़िंदगी",
    "yaar": "यार", "bhai": "भाई", "kya": "क्या", "hai": "है",
    "nahi": "नहीं", "haan": "हाँ", "accha": "अच्छा", "achha": "अच्छा",
    "bahut": "बहुत", "bohot": "बहुत", "mast": "मस्त",
    "badiya": "बढ़िया", "zabardast": "ज़बरदस्त", "kamaal": "कमाल",
    "theek": "ठीक", "sahi": "सही", "galat": "गलत",
    "khana": "खाना", "pani": "पानी", "chai": "चाय",
    "aaj": "आज", "kal": "कल", "abhi": "अभी",
    "main": "मैं", "mein": "में", "toh": "तो",
    "kaam": "काम", "paisa": "पैसा", "time": "टाइम",
    "phone": "फ़ोन", "movie": "मूवी", "gana": "गाना",
    "dil": "दिल", "aankhein": "आँखें", "haath": "हाथ",
    "gussa": "गुस्सा", "dar": "डर", "khauf": "ख़ौफ़",
    "hasna": "हँसना", "rona": "रोना", "bolna": "बोलना",
    "sunna": "सुनना", "dekhna": "देखना", "jaana": "जाना",
    "aana": "आना", "karna": "करना", "rehna": "रहना",
}
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# DATACLASS
|
|
154
|
+
@dataclass
class HinglishAnalysis:
    """Result record returned by ``HinglishAnalyzer.analyze``."""

    # Overall polarity label: "positive", "negative", "mixed" or "neutral".
    mood: str
    # Strength score capped at 1.0 and rounded to two decimals.
    intensity: float
    # Confidence bucket chosen from the share of sentiment words in the text.
    confidence: float
    # Emoji paired with the mood label.
    emoji: str
    # Filled with the same value as `mood` by the analyzer.
    sentiment: str
    # Up to four text fragments split on . ! ? , (each at most 80 characters).
    key_phrases: List[str]
    # Sarcasm verdict and the score it was derived from.
    sarcasm: bool
    sarcasm_confidence: float
    # Ratios under the keys "hinglish", "english" and "unknown".
    language_mix: Dict[str, float]
    # "praise", "complaint", "mixed" or "casual" (set together with `mood`).
    category: str
    # One-sentence generated description of the text.
    summary: str
    # Emotion name -> score; {"neutral": 1.0} when nothing matched.
    emotions: Dict[str, float]
    # Number of `\w+` tokens in the lowercased text.
    word_count: int
    # Distinct lexicon words that occurred in the text.
    positive_words_found: List[str]
    negative_words_found: List[str]
    # Text with known words replaced by Devanagari; defaults to None.
    transliteration: Optional[str] = None
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
# MAIN ANALYZER
|
|
175
|
+
class HinglishAnalyzer:
    """Lexicon- and regex-based analyzer for Roman-script Hinglish text.

    The analyzer keeps no instance state: every result depends only on the
    arguments and on the module-level lexicons (``POSITIVE_WORDS``,
    ``NEGATIVE_WORDS``, ``INTENSIFIERS``, ``EMOTION_LEXICON``,
    ``SARCASM_PATTERNS``, ``HINGLISH_WORDS``, ``TRANSLITERATION_MAP``).
    """

    # ── Language Detection ──────────────────
    def detect_language_mix(self, text: str) -> Dict[str, float]:
        """Estimate which share of the tokens is Hinglish, English or unknown.

        Tokens are the ``\\w+`` runs of the lowercased text. A token counts as
        Hinglish when it is in ``HINGLISH_WORDS``, as English when it is not in
        that set, is purely alphabetic and longer than two characters, and as
        unknown otherwise.

        Returns:
            A dict with the keys ``"hinglish"``, ``"english"`` and
            ``"unknown"``. Each ratio is rounded to two decimals on its own,
            so the three values need not add up to exactly 1.0. Text without
            any token yields ``{"hinglish": 0.5, "english": 0.5,
            "unknown": 0.0}``.
        """
        words = re.findall(r'\w+', text.lower())
        if not words:
            return {"hinglish": 0.5, "english": 0.5, "unknown": 0.0}

        hinglish_count = sum(1 for w in words if w in HINGLISH_WORDS)
        # simple English heuristic: words NOT in hinglish set, length > 2
        english_count = sum(
            1 for w in words
            if w not in HINGLISH_WORDS and len(w) > 2 and w.isalpha()
        )
        total = len(words)
        unknown = max(0, total - hinglish_count - english_count)

        return {
            "hinglish": round(hinglish_count / total, 2),
            "english": round(english_count / total, 2),
            "unknown": round(unknown / total, 2),
        }

    # ── Sarcasm Detection ───────────────────
    def detect_sarcasm(self, text: str) -> Dict[str, Any]:
        """Score how sarcastic ``text`` looks.

        The score is 0.3 for every entry of ``SARCASM_PATTERNS`` found in the
        lowercased text, plus 0.2 when the text contains both a positive and a
        negative lexicon word, capped at 1.0.

        Returns:
            ``{"is_sarcastic": bool, "confidence": float}``. The flag is set
            only when the score is strictly above 0.3, so a single pattern
            match on its own is not enough.
        """
        text_lower = text.lower()
        matched = [p for p in SARCASM_PATTERNS if re.search(p, text_lower)]

        # Contradiction: positive word + negative context
        words = set(re.findall(r'\w+', text_lower))
        has_positive = bool(words & POSITIVE_WORDS)
        has_negative = bool(words & NEGATIVE_WORDS)
        contradiction = has_positive and has_negative

        score = len(matched) * 0.3 + (0.2 if contradiction else 0)
        score = min(1.0, score)
        return {"is_sarcastic": score > 0.3, "confidence": round(score, 2)}

    # ── Emotion Detection ───────────────────
    def detect_emotions(self, words: List[str]) -> Dict[str, float]:
        """Score each emotion of ``EMOTION_LEXICON`` against ``words``.

        Args:
            words: Tokens to look up; duplicates are counted once.

        Returns:
            Emotion name -> ``min(1.0, 0.35 * distinct matches)`` rounded to
            two decimals, for emotions with at least one match;
            ``{"neutral": 1.0}`` when no emotion matched.
        """
        word_set = set(words)
        scores = {}
        for emotion, lexicon in EMOTION_LEXICON.items():
            matches = len(word_set & lexicon)
            if matches:
                scores[emotion] = round(min(1.0, matches * 0.35), 2)
        return scores if scores else {"neutral": 1.0}

    # ── Confidence ──────────────────────────
    def calculate_confidence(self, pos: int, neg: int, total: int) -> float:
        """Map the share of sentiment words to a fixed confidence bucket.

        Args:
            pos: Number of positive word occurrences.
            neg: Number of negative word occurrences.
            total: Total number of words; values below 1 are treated as 1.

        Returns:
            0.92, 0.75, 0.60 or 0.45 for a share above 0.3, 0.15, 0.05 or
            otherwise, respectively.
        """
        signal = (pos + neg) / max(total, 1)
        if signal > 0.3:
            return 0.92
        elif signal > 0.15:
            return 0.75
        elif signal > 0.05:
            return 0.60
        return 0.45

    # ── Transliteration ─────────────────────
    def transliterate(self, text: str) -> str:
        """Replace known Roman-Hindi words with their Devanagari spelling.

        The text is split on whitespace. Each word is looked up in
        ``TRANSLITERATION_MAP`` after lowercasing and removing non-word
        characters; words that are not in the map are kept exactly as written.
        Punctuation before or after a replaced word is preserved, so
        ``"hai!"`` becomes ``"है!"`` rather than losing the exclamation mark.

        Returns:
            The converted words joined by single spaces.
        """
        result = []
        for word in text.split():
            clean = re.sub(r'[^\w]', '', word.lower())
            mapped = TRANSLITERATION_MAP.get(clean)
            if mapped is None:
                result.append(word)
                continue
            # Re-attach the punctuation that surrounded the replaced word.
            leading = re.match(r'\W*', word).group()
            trailing = re.search(r'\W*$', word).group()
            result.append(f"{leading}{mapped}{trailing}")
        return " ".join(result)

    # ── Key Phrases ─────────────────────────
    def extract_key_phrases(self, text: str) -> List[str]:
        """Split ``text`` on ``. ! ? ,`` and return the first usable fragments.

        Fragments are stripped; only those longer than three characters are
        kept, each truncated to 80 characters.

        Returns:
            At most four fragments, in their original order.
        """
        sentences = re.split(r'[.!?,]', text)
        phrases = []
        for s in sentences:
            s = s.strip()
            if len(s) > 3:
                phrases.append(s[:80])
        return phrases[:4]

    # ── Summary ─────────────────────────────
    def generate_summary(self, text: str, mood: str, emotions: Dict) -> str:
        """Build a one-sentence description of the analysis.

        Args:
            text: Original text; fewer than six whitespace-separated words
                makes the message "short", otherwise "detailed".
            mood: Mood label to mention.
            emotions: Emotion name -> score; the highest-scoring name is
                reported, or "neutral" when the mapping is empty.
        """
        top_emotion = max(emotions, key=emotions.get) if emotions else "neutral"
        length = "short" if len(text.split()) < 6 else "detailed"
        return (
            f"A {length} Hinglish message expressing {mood} sentiment "
            f"with primary emotion: {top_emotion}."
        )

    # ── Mood ────────────────────────────────
    @staticmethod
    def _classify_mood(pos_count: int, neg_count: int) -> tuple:
        """Return ``(mood, emoji, category)`` for the given sentiment counts.

        A polarity wins when it is the only one present, or when it leads the
        other by more than one hit. Both polarities present without such a
        lead is "mixed"; no sentiment words at all is "neutral".
        """
        if pos_count > neg_count + 1 or (pos_count > 0 and neg_count == 0):
            return "positive", "😊", "praise"
        if neg_count > pos_count + 1 or (neg_count > 0 and pos_count == 0):
            return "negative", "😠", "complaint"
        if pos_count > 0 and neg_count > 0:
            return "mixed", "🤨", "mixed"
        return "neutral", "😐", "casual"

    # ── Main Analyze ────────────────────────
    def analyze(self, text: str) -> "HinglishAnalysis":
        """Run every analysis on ``text`` and bundle the results.

        Empty or whitespace-only text returns the fixed neutral result.
        Otherwise the lowercased text is tokenised with ``\\w+`` and the
        tokens are matched against the sentiment and intensifier lexicons.

        Intensity is ``0.20`` per sentiment word occurrence, ``0.15`` per
        intensifier and ``0.01`` per token, capped at 1.0 and rounded to two
        decimals.

        Returns:
            A populated ``HinglishAnalysis``.
        """
        if not text or not text.strip():
            return self._neutral_result()

        text_lower = text.lower()
        words = re.findall(r'\w+', text_lower)

        pos_found = [w for w in words if w in POSITIVE_WORDS]
        neg_found = [w for w in words if w in NEGATIVE_WORDS]
        intens_count = sum(1 for w in words if w in INTENSIFIERS)

        pos_count = len(pos_found)
        neg_count = len(neg_found)

        # Intensity
        intensity = min(
            1.0,
            (pos_count + neg_count) * 0.20
            + intens_count * 0.15
            + len(words) * 0.01
        )
        intensity = round(intensity, 2)

        # Mood & category
        mood, emoji, category = self._classify_mood(pos_count, neg_count)

        sarcasm_result = self.detect_sarcasm(text)
        emotions = self.detect_emotions(words)
        lang_mix = self.detect_language_mix(text)
        confidence = self.calculate_confidence(pos_count, neg_count, len(words))
        key_phrases = self.extract_key_phrases(text)
        summary = self.generate_summary(text, mood, emotions)
        transliteration = self.transliterate(text)

        return HinglishAnalysis(
            mood=mood,
            intensity=intensity,
            confidence=confidence,
            emoji=emoji,
            sentiment=mood,
            key_phrases=key_phrases,
            sarcasm=sarcasm_result["is_sarcastic"],
            sarcasm_confidence=sarcasm_result["confidence"],
            language_mix=lang_mix,
            category=category,
            summary=summary,
            emotions=emotions,
            word_count=len(words),
            # dict.fromkeys removes repeats but keeps first-occurrence order,
            # so the lists are the same on every run.
            positive_words_found=list(dict.fromkeys(pos_found)),
            negative_words_found=list(dict.fromkeys(neg_found)),
            transliteration=transliteration,
        )

    def _neutral_result(self) -> "HinglishAnalysis":
        """Return the fixed result used for empty or whitespace-only text."""
        return HinglishAnalysis(
            mood="neutral", intensity=0.0, confidence=0.5, emoji="😐",
            sentiment="neutral", key_phrases=[], sarcasm=False,
            sarcasm_confidence=0.0,
            language_mix={"hinglish": 0.5, "english": 0.5, "unknown": 0.0},
            category="casual", summary="Empty or neutral text.",
            emotions={"neutral": 1.0}, word_count=0,
            positive_words_found=[], negative_words_found=[],
            transliteration="",
        )
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
# PUBLIC API
|
|
337
|
+
# Single analyzer instance shared by the module-level convenience functions.
_analyzer = HinglishAnalyzer()
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def analyze(text: str) -> Dict[str, Any]:
    """Analyze one Hinglish text and return the result as a plain dict.

    The returned mapping is the attribute dictionary of the
    ``HinglishAnalysis`` object built by the shared analyzer, keyed by field
    name.
    """
    analysis = _analyzer.analyze(text)
    return vars(analysis)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def analyze_batch(texts: List[str]) -> List[Dict[str, Any]]:
    """Analyze several Hinglish texts; results keep the order of ``texts``."""
    return list(map(analyze, texts))
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def transliterate(text: str) -> str:
    """Convert Roman Hinglish to Devanagari script.

    Delegates to the shared analyzer's ``transliterate`` method.
    """
    return _analyzer.transliterate(text)
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def detect_language(text: str) -> Dict[str, float]:
    """Detect language mix in text.

    Delegates to the shared analyzer's ``detect_language_mix`` method.
    """
    return _analyzer.detect_language_mix(text)
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def detect_emotion(text: str) -> Dict[str, float]:
    """Return the emotion scores for ``text``.

    The text is lowercased and split into ``\\w+`` tokens, which are then
    scored by the shared analyzer's ``detect_emotions`` method.
    """
    lowered = text.lower()
    tokens = re.findall(r'\w+', lowered)
    return _analyzer.detect_emotions(tokens)
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def is_sarcastic(text: str) -> Dict[str, Any]:
    """Check if text is sarcastic.

    Delegates to the shared analyzer's ``detect_sarcasm`` method and returns
    its result dict unchanged.
    """
    return _analyzer.detect_sarcasm(text)
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hinglish-nlp
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Hinglish (Roman Hindi + English) NLP toolkit - Sentiment, Emotion, Sarcasm & more
|
|
5
|
+
Author-email: Lalit <official.lalitpal08@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Lalit2206/hinglish-nlp
|
|
8
|
+
Project-URL: Repository, https://github.com/Lalit2206/hinglish-nlp
|
|
9
|
+
Keywords: hinglish,nlp,sentiment,hindi,roman-hindi,text-analysis
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
14
|
+
Classifier: Natural Language :: Hindi
|
|
15
|
+
Requires-Python: >=3.9
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: pydantic>=2.0
|
|
18
|
+
|
|
19
|
+
# hinglish-nlp 🇮🇳
|
|
20
|
+
|
|
21
|
+
A powerful NLP toolkit for **Hinglish** (Roman Hindi + English) text analysis.
|
|
22
|
+
|
|
23
|
+
[PyPI version](https://badge.fury.io/py/hinglish-nlp)
|
|
24
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
25
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install hinglish-nlp
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## Features
|
|
38
|
+
|
|
39
|
+
| Feature | Description |
|
|
40
|
+
|--------|-------------|
|
|
41
|
+
| ✅ Sentiment Analysis | Positive / Negative / Neutral / Mixed |
|
|
42
|
+
| ✅ Emotion Detection | Joy, Anger, Sadness, Fear, Surprise, Disgust |
|
|
43
|
+
| ✅ Sarcasm Detection | Pattern + contradiction based |
|
|
44
|
+
| ✅ Language Mix Detection | Hinglish vs English ratio |
|
|
45
|
+
| ✅ Transliteration | Roman Hindi → Devanagari |
|
|
46
|
+
| ✅ Batch Processing | Multiple texts at once |
|
|
47
|
+
| ✅ Key Phrase Extraction | Important phrases from text |
|
|
48
|
+
| ✅ Intensity Score | 0.0 to 1.0 scale |
|
|
49
|
+
| ✅ Confidence Score | How confident the analyzer is in its result (0.0 to 1.0) |
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Usage
|
|
54
|
+
|
|
55
|
+
### Basic Sentiment Analysis
|
|
56
|
+
```python
|
|
57
|
+
from hinglish import analyze
|
|
58
|
+
|
|
59
|
+
result = analyze("yaar bahut mast movie thi!")
|
|
60
|
+
print(result["mood"]) # positive
|
|
61
|
+
print(result["emoji"]) # 😊
|
|
62
|
+
print(result["intensity"]) # 0.45
|
|
63
|
+
print(result["confidence"]) # 0.75
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Emotion Detection
|
|
67
|
+
```python
|
|
68
|
+
from hinglish import detect_emotion
|
|
69
|
+
|
|
70
|
+
emotions = detect_emotion("mujhe bahut gussa aa raha hai!")
|
|
71
|
+
print(emotions) # {'anger': 0.35}
|
|
72
|
+
|
|
73
|
+
emotions = detect_emotion("aaj bahut khushi hui yaar!")
|
|
74
|
+
print(emotions) # {'joy': 0.7}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Sarcasm Detection
|
|
78
|
+
```python
|
|
79
|
+
from hinglish import is_sarcastic
|
|
80
|
+
|
|
81
|
+
result = is_sarcastic("haan bilkul, bahut accha hai na!!")
|
|
82
|
+
print(result)
|
|
83
|
+
# {'is_sarcastic': True, 'confidence': 0.6}
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Language Mix Detection
|
|
87
|
+
```python
|
|
88
|
+
from hinglish import detect_language
|
|
89
|
+
|
|
90
|
+
mix = detect_language("yaar ye movie bahut boring thi")
|
|
91
|
+
print(mix)
|
|
92
|
+
# {'hinglish': 0.5, 'english': 0.33, 'unknown': 0.17}
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Transliteration (Roman → Devanagari)
|
|
96
|
+
```python
|
|
97
|
+
from hinglish import transliterate
|
|
98
|
+
|
|
99
|
+
text = transliterate("mera naam lalit hai")
|
|
100
|
+
print(text) # मेरा नाम ललित है
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Batch Processing
|
|
104
|
+
```python
|
|
105
|
+
from hinglish import analyze_batch
|
|
106
|
+
|
|
107
|
+
texts = [
|
|
108
|
+
"yaar mast movie thi!",
|
|
109
|
+
"bilkul bakwaas tha yaar",
|
|
110
|
+
"theek thak tha, kuch khaas nahi"
|
|
111
|
+
]
|
|
112
|
+
|
|
113
|
+
results = analyze_batch(texts)
|
|
114
|
+
for r in results:
|
|
115
|
+
print(r["mood"], r["emoji"])
|
|
116
|
+
# positive 😊
|
|
117
|
+
# negative 😠
|
|
118
|
+
# neutral 😐
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Full Analysis
|
|
122
|
+
```python
|
|
123
|
+
from hinglish import analyze
|
|
124
|
+
|
|
125
|
+
result = analyze("Phone ki battery toh bekar hai but camera mast hai")
|
|
126
|
+
print(result)
|
|
127
|
+
# {
|
|
128
|
+
# 'mood': 'mixed',
|
|
129
|
+
# 'intensity': 0.3,
|
|
130
|
+
# 'confidence': 0.75,
|
|
131
|
+
# 'emoji': '🤨',
|
|
132
|
+
# 'sentiment': 'mixed',
|
|
133
|
+
# 'key_phrases': ['Phone ki battery toh bekar hai but camera mast hai'],
|
|
134
|
+
# 'sarcasm': False,
|
|
135
|
+
# 'sarcasm_confidence': 0.0,
|
|
136
|
+
# 'language_mix': {'hinglish': 0.36, 'english': 0.55, 'unknown': 0.09},
|
|
137
|
+
# 'category': 'mixed',
|
|
138
|
+
# 'summary': 'A detailed Hinglish message expressing mixed sentiment...',
|
|
139
|
+
# 'emotions': {'disgust': 0.35},
|
|
140
|
+
# 'word_count': 11,
|
|
141
|
+
# 'positive_words_found': ['mast'],
|
|
142
|
+
# 'negative_words_found': ['bekar'],
|
|
143
|
+
# 'transliteration': 'Phone की battery तो bekar है but camera मस्त है'
|
|
144
|
+
# }
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Output Fields
|
|
150
|
+
|
|
151
|
+
| Field | Type | Description |
|
|
152
|
+
|-------|------|-------------|
|
|
153
|
+
| `mood` | str | positive / negative / neutral / mixed |
|
|
154
|
+
| `intensity` | float | 0.0 – 1.0 |
|
|
155
|
+
| `confidence` | float | 0.0 – 1.0 |
|
|
156
|
+
| `emoji` | str | Visual mood indicator |
|
|
157
|
+
| `sentiment` | str | Same as mood |
|
|
158
|
+
| `key_phrases` | list | Important phrases |
|
|
159
|
+
| `sarcasm` | bool | Is text sarcastic? |
|
|
160
|
+
| `sarcasm_confidence` | float | Sarcasm confidence score |
|
|
161
|
+
| `language_mix` | dict | hinglish / english / unknown ratio |
|
|
162
|
+
| `category` | str | praise / complaint / casual / mixed |
|
|
163
|
+
| `summary` | str | Short summary of the text |
|
|
164
|
+
| `emotions` | dict | Detected emotions with scores |
|
|
165
|
+
| `word_count` | int | Total word count |
|
|
166
|
+
| `positive_words_found` | list | Positive words detected |
|
|
167
|
+
| `negative_words_found` | list | Negative words detected |
|
|
168
|
+
| `transliteration` | str | Roman → Devanagari |
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Author
|
|
173
|
+
|
|
174
|
+
**Lalit** — [lalitpal2206](https://pypi.org/user/lalitpal2206/)
|
|
175
|
+
|
|
176
|
+
## License
|
|
177
|
+
|
|
178
|
+
MIT License
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pydantic>=2.0
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Declare the build backend explicitly (PEP 517/518). Without this table,
# build frontends fall back to a legacy setuptools default with no minimum
# version, while the [project] table below needs setuptools >= 61.
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "hinglish-nlp"
version = "0.2.0"
description = "Hinglish (Roman Hindi + English) NLP toolkit - Sentiment, Emotion, Sarcasm & more"
readme = "README.md"
requires-python = ">=3.9"
license = {text = "MIT"}
authors = [
    {name = "Lalit", email = "official.lalitpal08@gmail.com"},
]
keywords = ["hinglish", "nlp", "sentiment", "hindi", "roman-hindi", "text-analysis"]
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Topic :: Text Processing :: Linguistic",
    "Natural Language :: Hindi",
]

# Runtime dependencies installed alongside the package.
dependencies = [
    "pydantic>=2.0",
]

[project.urls]
Homepage = "https://github.com/Lalit2206/hinglish-nlp"
Repository = "https://github.com/Lalit2206/hinglish-nlp"

# Package discovery starts from the project root.
[tool.setuptools.packages.find]
where = ["."]
|