aize 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aize/__init__.py ADDED
@@ -0,0 +1,32 @@
1
+ """
2
+ aize — NLP Analysis Toolkit
3
+ A lightweight, pip-installable library for text analysis.
4
+ """
5
+
6
+ from aize.analysis.stats import compute_stats
7
+ from aize.analysis.groupwords import analyze_groupwords
8
+ from aize.analysis.zipf import analyze_zipf
9
+ from aize.analysis.heaps import analyze_heaps
10
+ from aize.analysis.stopwords import calculate_density
11
+ from aize.analysis.vocab import compare_vocab
12
+ from aize.analysis.tfidf import compute_tfidf, compute_ngrams
13
+ from aize.analysis.sentiment import analyze_sentiment
14
+ from aize.analysis.readability import compute_readability
15
+ from aize.analysis.pos import analyze_pos
16
+ from aize.analysis.wordcloud_gen import generate_wordcloud
17
+
18
+ __version__ = "0.1.0"
19
+ __all__ = [
20
+ "compute_stats",
21
+ "analyze_groupwords",
22
+ "analyze_zipf",
23
+ "analyze_heaps",
24
+ "calculate_density",
25
+ "compare_vocab",
26
+ "compute_tfidf",
27
+ "compute_ngrams",
28
+ "analyze_sentiment",
29
+ "compute_readability",
30
+ "analyze_pos",
31
+ "generate_wordcloud",
32
+ ]
@@ -0,0 +1 @@
1
+ # aize analysis sub-package
@@ -0,0 +1,21 @@
1
+ """Word-length (groupwords) distribution analysis."""
2
+ import re
3
+
4
+
5
def analyze_groupwords(text: str) -> dict:
    """
    Count the distinct words of each character length.

    The text is split on whitespace; every token is lower-cased and has
    leading/trailing punctuation stripped. Tokens that end up empty are
    ignored.

    Returns:
        {length: number_of_unique_words}, ordered by ascending length.
    """
    by_length: dict[int, set] = {}
    for token in text.split():
        cleaned = token.lower().strip('.,!?"():;[]#«»')
        if cleaned:
            by_length.setdefault(len(cleaned), set()).add(cleaned)

    return {length: len(by_length[length]) for length in sorted(by_length)}
aize/analysis/heaps.py ADDED
@@ -0,0 +1,40 @@
1
+ """Heap's Law — vocabulary growth (types vs tokens)."""
2
+ import re
3
+
4
+
5
def analyze_heaps(text: str, sample_every: int = 100) -> dict:
    """
    Trace vocabulary growth (Heaps' law) while reading the text token by token.

    Tokens are ASCII-letter words of 3-10 characters whose letters after the
    first are lower case; they are lower-cased before being compared.

    Args:
        text: raw text to analyse
        sample_every: a sample point is recorded each time the running token
            count is a multiple of this value

    Returns:
        {
            "tokens": [int, ...],    # running token count at each sample point
            "types": [int, ...],     # unique words seen so far at each sample point
            "total_tokens": int,
            "total_types": int,
            "diversity_pct": float,  # total_types / total_tokens * 100
        }
    """
    seen: set[str] = set()
    token_samples, type_samples = [], []
    token_count = 0

    matches = re.findall(r'\b[A-Za-z][a-z]{2,9}\b', text)
    for token_count, match in enumerate(matches, start=1):
        seen.add(match.lower())
        if token_count % sample_every == 0:
            token_samples.append(token_count)
            type_samples.append(len(seen))

    type_count = len(seen)
    if token_count > 0:
        diversity = round(type_count / token_count * 100, 2)
    else:
        diversity = 0

    return {
        "tokens": token_samples,
        "types": type_samples,
        "total_tokens": token_count,
        "total_types": type_count,
        "diversity_pct": diversity,
    }
aize/analysis/pos.py ADDED
@@ -0,0 +1,59 @@
1
+ """Part-of-speech tag distribution using NLTK."""
2
+ import re
3
+ import nltk
4
+
5
# Make sure the tagger and tokenizer data are available. Both the old
# (averaged_perceptron_tagger / punkt) and the new
# (averaged_perceptron_tagger_eng / punkt_tab) resource names are checked,
# and whichever is missing is downloaded quietly.
for _category, _resource in (
    ("taggers", "averaged_perceptron_tagger"),
    ("taggers", "averaged_perceptron_tagger_eng"),
    ("tokenizers", "punkt"),
    ("tokenizers", "punkt_tab"),
):
    try:
        nltk.data.find(f"{_category}/{_resource}")
    except LookupError:
        nltk.download(_resource, quiet=True)
18
+
19
# Penn Treebank tags bucketed into coarse part-of-speech groups. "Other" owns
# no tags itself; it collects every tag that no other group claims.
_TAG_GROUPS = {
    "Noun": {"NN", "NNS", "NNP", "NNPS"},
    "Verb": {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"},
    "Adjective": {"JJ", "JJR", "JJS"},
    "Adverb": {"RB", "RBR", "RBS"},
    "Pronoun": {"PRP", "PRP$", "WP", "WP$"},
    "Determiner": {"DT"},
    "Preposition": {"IN"},
    "Conjunction": {"CC"},
    "Other": set(),
}


def analyze_pos(text: str) -> dict:
    """
    Count tokens per coarse part-of-speech group.

    Only the first 10,000 characters of ``text`` are tokenized and tagged, to
    keep the call fast on large inputs.

    Returns:
        {"Noun": int, "Verb": int, "Adjective": int, ...} with one entry per
        key of ``_TAG_GROUPS`` (including "Other").
    """
    sample = text[:10_000]
    try:
        tokens = nltk.word_tokenize(sample)
    except Exception:
        # Best effort: if the NLTK tokenizer fails, split on whitespace instead.
        tokens = sample.split()

    counts = dict.fromkeys(_TAG_GROUPS, 0)
    for _token, tag in nltk.pos_tag(tokens):
        # First group whose tag set contains this tag; unclaimed tags are "Other".
        group = next(
            (name for name, tags in _TAG_GROUPS.items() if tag in tags),
            "Other",
        )
        counts[group] += 1

    return counts
@@ -0,0 +1,59 @@
1
+ """Flesch-Kincaid readability scores."""
2
+ import re
3
+
4
+
5
def _count_syllables(word: str) -> int:
    """Estimate syllables: count vowel groups, discounting a trailing 'e'."""
    word = word.lower()
    count = len(re.findall(r'[aeiouy]+', word))
    if word.endswith('e') and count > 1:
        count -= 1
    return max(count, 1)


def compute_readability(text: str) -> dict:
    """
    Compute Flesch Reading Ease and Flesch-Kincaid Grade Level.

    Sentences are the non-blank pieces left after splitting on runs of
    ``.``, ``!`` and ``?``; words are runs of ASCII letters; syllables are
    estimated per word from its vowel groups.

    Returns:
        {
            "flesch_reading_ease": float,  # higher = easier
            "fk_grade_level": float,       # US school grade level
            "sentences": int,
            "words": int,
            "syllables": int,
            "interpretation": str,         # "Very Easy" ... "Very Confusing",
                                           # or "N/A" when the text has no words
        }

    When the text contains no words, the scores are 0.0, the counts are the
    true (zero) counts and the interpretation is "N/A", instead of scores
    computed from made-up counts of one word and one sentence.
    """
    sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
    words = re.findall(r'\b[a-zA-Z]+\b', text)

    if not words:
        return {
            "flesch_reading_ease": 0.0,
            "fk_grade_level": 0.0,
            "sentences": len(sentences),
            "words": 0,
            "syllables": 0,
            "interpretation": "N/A",
        }

    # A word implies a letter, hence at least one non-blank sentence piece,
    # so neither divisor below can be zero.
    num_sentences = len(sentences)
    num_words = len(words)
    num_syllables = sum(_count_syllables(w) for w in words)

    asl = num_words / num_sentences   # average sentence length (words)
    asw = num_syllables / num_words   # average syllables per word

    fre = 206.835 - (1.015 * asl) - (84.6 * asw)
    fkgl = (0.39 * asl) + (11.8 * asw) - 15.59

    # Lower bound of each reading-ease band, checked from easiest to hardest.
    bands = (
        (90, "Very Easy"),
        (70, "Easy"),
        (60, "Standard"),
        (50, "Fairly Difficult"),
        (30, "Difficult"),
    )
    interp = next((label for floor, label in bands if fre >= floor), "Very Confusing")

    return {
        "flesch_reading_ease": round(fre, 2),
        "fk_grade_level": round(fkgl, 2),
        "sentences": num_sentences,
        "words": num_words,
        "syllables": num_syllables,
        "interpretation": interp,
    }
@@ -0,0 +1,40 @@
1
+ """VADER sentiment analysis."""
2
+ import nltk
3
+
4
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Build the shared analyzer once at import time. If construction raises
# LookupError, the VADER lexicon is downloaded quietly and construction retried.
try:
    _sid = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download("vader_lexicon", quiet=True)
    _sid = SentimentIntensityAnalyzer()
11
+
12
+
13
def analyze_sentiment(text: str) -> dict:
    """
    Score ``text`` with the module-level VADER analyzer.

    The label is "Positive" when the compound score is >= 0.05, "Negative"
    when it is <= -0.05, and "Neutral" otherwise. All scores are rounded to
    four decimal places.

    Returns:
        {
            "positive": float,
            "negative": float,
            "neutral": float,
            "compound": float,
            "label": str,       # "Positive" | "Negative" | "Neutral"
        }
    """
    raw = _sid.polarity_scores(text)
    compound = raw["compound"]

    label = "Neutral"
    if compound >= 0.05:
        label = "Positive"
    elif compound <= -0.05:
        label = "Negative"

    # Output key -> key used in the analyzer's score dict.
    key_map = (
        ("positive", "pos"),
        ("negative", "neg"),
        ("neutral", "neu"),
        ("compound", "compound"),
    )
    result = {out_key: round(raw[src_key], 4) for out_key, src_key in key_map}
    result["label"] = label
    return result
aize/analysis/stats.py ADDED
@@ -0,0 +1,16 @@
1
+ """Word/line/char/space statistics for a text."""
2
+ import re
3
+
4
+
5
def compute_stats(text: str) -> dict:
    """
    Return basic text statistics.

    Returns:
        {
            "lines": int,       # number of lines (str.splitlines)
            "words": int,       # whitespace-separated tokens
            "characters": int,  # every character except " " and "\\n"
            "spaces": int,      # occurrences of the space character
        }
    """
    space_count = text.count(" ")
    newline_count = text.count("\n")
    return {
        "lines": len(text.splitlines()),
        "words": len(text.split()),
        "characters": len(text) - space_count - newline_count,
        "spaces": space_count,
    }
@@ -0,0 +1,40 @@
1
+ """Stop-word density analysis using NLTK stopwords."""
2
+ import re
3
+ import nltk
4
+
5
from nltk.corpus import stopwords as nltk_sw

_LANGUAGES = ("english", "spanish")

# Load the stop-word lists once at import time. If the corpus lookup raises
# LookupError, it is downloaded quietly and the load is retried.
try:
    STOPWORD_SETS = {name: set(nltk_sw.words(name)) for name in _LANGUAGES}
except LookupError:
    nltk.download("stopwords", quiet=True)
    STOPWORD_SETS = {name: set(nltk_sw.words(name)) for name in _LANGUAGES}

_en = STOPWORD_SETS["english"]
_es = STOPWORD_SETS["spanish"]
16
+
17
+
18
def calculate_density(text: str, language: str = "english") -> dict:
    """
    Measure what share of the words in ``text`` are stop words.

    Words are ``\\w+`` runs of the lower-cased text. A ``language`` that is
    not a key of ``STOPWORD_SETS`` falls back to the English list, but the
    returned "language" is always the value that was passed in.

    Returns:
        {
            "total_words": int,
            "stop_words": int,
            "density_pct": float,  # stop_words / total_words * 100, 4 decimals
            "language": str,
        }
    """
    stop_set = STOPWORD_SETS.get(language, _en)
    tokens = re.findall(r'\b\w+\b', text.lower())
    total = len(tokens)

    # Summing booleans counts the tokens that are stop words.
    stop_total = sum(token in stop_set for token in tokens)
    density = round(stop_total / total * 100, 4) if total else 0.0

    return {
        "total_words": total,
        "stop_words": stop_total,
        "density_pct": density,
        "language": language,
    }
aize/analysis/tfidf.py ADDED
@@ -0,0 +1,47 @@
1
+ """TF-IDF keyword extraction and N-gram frequency analysis."""
2
+ import re
3
+ from collections import Counter
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+
6
+
7
def compute_tfidf(texts: list[str], labels: list[str], top_n: int = 15) -> dict:
    """
    Compute TF-IDF scores across a corpus of texts.

    Terms are runs of three or more lower-case ASCII letters, English stop
    words are removed, and at most 5000 features are kept.

    Args:
        texts: list of raw text strings
        labels: matching list of file/document names, one per text
        top_n: number of top keywords to return per document

    Returns:
        {label: [(term, score), ...], ...} with terms ordered by descending
        score, scores rounded to four decimals and zero-score terms omitted.
        An empty dict is returned when ``texts`` is empty or when the
        vectorizer rejects the corpus with ValueError.

    Raises:
        ValueError: if ``texts`` is non-empty and ``labels`` has a different
            length, since results could not be attributed reliably.
    """
    if not texts:
        return {}
    if len(labels) != len(texts):
        raise ValueError(
            f"labels and texts must have the same length "
            f"(got {len(labels)} labels for {len(texts)} texts)"
        )

    vec = TfidfVectorizer(stop_words="english", max_features=5000,
                          token_pattern=r'\b[a-z]{3,}\b')
    try:
        matrix = vec.fit_transform(texts)
    except ValueError:
        # Presumably no usable term survived filtering (e.g. only stop words).
        return {}

    terms = vec.get_feature_names_out()
    result = {}
    for idx, label in enumerate(labels):
        scores = matrix[idx].toarray().flatten()
        top_idx = scores.argsort()[::-1][:top_n]
        result[label] = [
            (terms[i], round(float(scores[i]), 4))
            for i in top_idx
            if scores[i] > 0
        ]
    return result
35
+
36
+
37
def compute_ngrams(text: str, n: int = 2, top_n: int = 20) -> list[tuple]:
    """
    Return the most common n-grams (e.g. bigrams or trigrams) in ``text``.

    Words are runs of two or more lower-case ASCII letters taken from the
    lower-cased text; each n-gram is ``n`` consecutive words joined by a
    single space.

    Returns:
        [(ngram_string, count), ...] with at most ``top_n`` entries, most
        frequent first.
    """
    tokens = re.findall(r'\b[a-z]{2,}\b', text.lower())

    # shifted[k] is the token list starting at position k; zipping the shifted
    # lists walks a sliding window of n consecutive tokens.
    shifted = [tokens[offset:] for offset in range(n)]
    tally = Counter()
    for window in zip(*shifted):
        tally[" ".join(window)] += 1
    return tally.most_common(top_n)
aize/analysis/vocab.py ADDED
@@ -0,0 +1,34 @@
1
+ """Vocabulary comparison across multiple texts (zero-frequency problem)."""
2
+
3
+
4
def compare_vocab(dict_a: dict, name_a: str, dict_b: dict, name_b: str) -> dict:
    """
    Compare the vocabularies (key sets) of two frequency dictionaries.

    Only the keys are used; the counts stored as values are ignored.

    Returns:
        {
            "name_a": str, "size_a": int,
            "name_b": str, "size_b": int,
            "common": int,                  # words present in both
            "only_in_a": int,
            "only_in_b": int,
            "pct_a_missing_from_b": float,  # only_in_a / size_a * 100,
                                            # 0.0 when A is empty
        }
    """
    vocab_a, vocab_b = set(dict_a), set(dict_b)
    missing_from_b = vocab_a - vocab_b

    if vocab_a:
        pct_missing = round(len(missing_from_b) / len(vocab_a) * 100, 2)
    else:
        pct_missing = 0.0

    return {
        "name_a": name_a,
        "size_a": len(vocab_a),
        "name_b": name_b,
        "size_b": len(vocab_b),
        "common": len(vocab_a & vocab_b),
        "only_in_a": len(missing_from_b),
        "only_in_b": len(vocab_b - vocab_a),
        "pct_a_missing_from_b": pct_missing,
    }
@@ -0,0 +1,30 @@
1
+ """Word cloud image generation."""
2
+ import re
3
+ import io
4
+
5
+
6
def generate_wordcloud(text: str, width: int = 800, height: int = 400,
                       background_color: str = "white") -> bytes:
    """
    Render a word cloud for ``text`` and return it as PNG data.

    The text is lower-cased and every character that is neither an ASCII
    letter nor whitespace is replaced by a space before rendering. The cloud
    shows at most 200 words, skips words shorter than three letters and does
    not include collocations.

    Returns:
        PNG image as bytes (ready for Streamlit st.image or an API response).
    """
    # Imported here so the dependency is only loaded when a cloud is requested.
    from wordcloud import WordCloud

    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())
    options = dict(
        width=width,
        height=height,
        background_color=background_color,
        collocations=False,
        max_words=200,
        min_word_length=3,
    )
    cloud = WordCloud(**options).generate(letters_only)

    png_buffer = io.BytesIO()
    cloud.to_image().save(png_buffer, format="PNG")
    return png_buffer.getvalue()
aize/analysis/zipf.py ADDED
@@ -0,0 +1,40 @@
1
+ """Zipf's Law analysis — rank/frequency + hapax/dis legomena."""
2
+ import re
3
+
4
+
5
def analyze_zipf(text: str) -> dict:
    """
    Compute word frequency distribution and Zipf statistics.

    Words are ASCII-letter words of 3-10 characters whose letters after the
    first are lower case. They are lower-cased before counting, so that
    "The" and "the" are one word (the same normalisation analyze_heaps
    applies). All percentages are relative to the number of distinct words.

    Returns:
        {
            "frequency": {word: count, ...},    # sorted most -> least frequent
            "rank_freq": [(rank, count), ...],  # for rank-frequency plot
            "hapax_pct": float,                 # % distinct words appearing once
            "dis_pct": float,                   # % distinct words appearing twice
            "freq_gt2_pct": float,              # % distinct words appearing > 2 times
        }
    """
    from collections import Counter

    matches = re.findall(r'\b[A-Za-z][a-z]{2,9}\b', text)
    frequency = Counter(match.lower() for match in matches)

    total = len(frequency)
    if total == 0:
        return {
            "frequency": {},
            "rank_freq": [],
            "hapax_pct": 0.0,
            "dis_pct": 0.0,
            "freq_gt2_pct": 0.0,
        }

    hapax = sum(1 for c in frequency.values() if c == 1)
    dis = sum(1 for c in frequency.values() if c == 2)
    gt2 = total - hapax - dis

    # most_common() sorts by descending count and keeps first-seen order
    # among words with equal counts.
    sorted_freq = dict(frequency.most_common())
    rank_freq = list(enumerate(sorted_freq.values(), start=1))

    return {
        "frequency": sorted_freq,
        "rank_freq": rank_freq,
        "hapax_pct": round(hapax / total * 100, 2),
        "dis_pct": round(dis / total * 100, 2),
        "freq_gt2_pct": round(gt2 / total * 100, 2),
    }
@@ -0,0 +1,433 @@
1
+ Metadata-Version: 2.4
2
+ Name: aize
3
+ Version: 0.1.0
4
+ Summary: aize — lightweight NLP analysis toolkit (Zipf, Heap's law, TF-IDF, sentiment, readability & more)
5
+ Author: eokoaze
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/eokoaze/aize
8
+ Project-URL: Repository, https://github.com/eokoaze/aize
9
+ Project-URL: Bug Tracker, https://github.com/eokoaze/aize/issues
10
+ Keywords: nlp,natural-language-processing,text-analysis,zipf,tfidf,sentiment,readability,wordcloud
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Text Processing :: Linguistic
21
+ Requires-Python: >=3.9
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: nltk>=3.8
25
+ Requires-Dist: scikit-learn>=1.2
26
+ Requires-Dist: wordcloud>=1.9
27
+ Requires-Dist: pandas>=1.5
28
+ Provides-Extra: dashboard
29
+ Requires-Dist: streamlit>=1.28; extra == "dashboard"
30
+ Requires-Dist: plotly>=5.0; extra == "dashboard"
31
+ Requires-Dist: Pillow>=9.0; extra == "dashboard"
32
+ Provides-Extra: api
33
+ Requires-Dist: fastapi>=0.100; extra == "api"
34
+ Requires-Dist: uvicorn>=0.23; extra == "api"
35
+ Requires-Dist: python-multipart>=0.0.6; extra == "api"
36
+ Provides-Extra: all
37
+ Requires-Dist: aize[dashboard]; extra == "all"
38
+ Requires-Dist: aize[api]; extra == "all"
39
+ Provides-Extra: dev
40
+ Requires-Dist: aize[all]; extra == "dev"
41
+ Requires-Dist: build>=1.0; extra == "dev"
42
+ Requires-Dist: twine>=5.0; extra == "dev"
43
+ Dynamic: license-file
44
+
45
+ # aize · NLP Analysis Toolkit
46
+
47
+ [![PyPI version](https://img.shields.io/pypi/v/aize.svg)](https://pypi.org/project/aize/)
48
+ [![Python](https://img.shields.io/pypi/pyversions/aize.svg)](https://pypi.org/project/aize/)
49
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
50
+
51
+ > A lightweight, pip-installable Python library for deep text analysis — covering everything from Zipf's law to sentiment, readability, TF-IDF, and more. Comes with a Streamlit dashboard and a FastAPI backend out of the box.
52
+
53
+ ---
54
+
55
+ ## Table of Contents
56
+
57
+ - [Features](#features)
58
+ - [Installation](#installation)
59
+ - [Quick Start](#quick-start)
60
+ - [Module Reference](#module-reference)
61
+ - [compute_stats](#compute_stats)
62
+ - [analyze_groupwords](#analyze_groupwords)
63
+ - [analyze_zipf](#analyze_zipf)
64
+ - [analyze_heaps](#analyze_heaps)
65
+ - [calculate_density](#calculate_density)
66
+ - [compare_vocab](#compare_vocab)
67
+ - [compute_tfidf](#compute_tfidf)
68
+ - [compute_ngrams](#compute_ngrams)
69
+ - [analyze_sentiment](#analyze_sentiment)
70
+ - [compute_readability](#compute_readability)
71
+ - [analyze_pos](#analyze_pos)
72
+ - [generate_wordcloud](#generate_wordcloud)
73
+ - [Streamlit Dashboard](#streamlit-dashboard)
74
+ - [FastAPI Backend](#fastapi-backend)
75
+ - [Dependencies](#dependencies)
76
+ - [Project Structure](#project-structure)
77
+ - [License](#license)
78
+
79
+ ---
80
+
81
+ ## Features
82
+
83
+ | Category | Capability |
84
+ |---|---|
85
+ | 📊 **Statistics** | Line count, word count, character count (excluding spaces and newlines), space count |
86
+ | 📏 **Word Grouping** | Frequency distribution grouped by word length |
87
+ | 📉 **Zipf's Law** | Rank-frequency distribution, hapax & dis legomena percentages |
88
+ | 📈 **Heap's Law** | Vocabulary growth curve as corpus size increases |
89
+ | 🚫 **Stopwords** | Stopword density analysis |
90
+ | 🔤 **Vocabulary** | Side-by-side vocabulary comparison across multiple texts |
91
+ | 🔍 **TF-IDF** | Top keyword extraction per document in a corpus |
92
+ | 🔗 **N-grams** | Most common bigrams and trigrams |
93
+ | 💬 **Sentiment** | VADER-based positive / negative / neutral / compound scoring |
94
+ | 📖 **Readability** | Flesch Reading Ease & Flesch-Kincaid Grade Level |
95
+ | 🏷️ **POS Tagging** | Part-of-speech frequency breakdown |
96
+ | ☁️ **Word Cloud** | Generates word cloud images from any text |
97
+ | 🖥️ **Dashboard** | Interactive Streamlit UI for all analyses |
98
+ | ⚡ **API** | FastAPI REST backend for programmatic access |
99
+
100
+ ---
101
+
102
+ ## Installation
103
+
104
+ ### Core library
105
+
106
+ ```bash
107
+ pip install aize
108
+ ```
109
+
110
+ ### With the Streamlit dashboard
111
+
112
+ ```bash
113
+ pip install aize[dashboard]
114
+ ```
115
+
116
+ ### With the FastAPI backend
117
+
118
+ ```bash
119
+ pip install aize[api]
120
+ ```
121
+
122
+ ### Everything (dashboard + API)
123
+
124
+ ```bash
125
+ pip install aize[all]
126
+ ```
127
+
128
+ ### From source (development)
129
+
130
+ ```bash
131
+ git clone https://github.com/eokoaze/aize.git
132
+ cd aize
133
+ pip install -e .[all]
134
+ ```
135
+
136
+ > **Python 3.9+** is required.
137
+
138
+ ---
139
+
140
+ ## Quick Start
141
+
142
+ ```python
143
+ import aize
144
+
145
+ text = """
146
+ Natural language processing is a subfield of linguistics and artificial intelligence.
147
+ It is primarily concerned with giving computers the ability to understand text and speech.
148
+ """
149
+
150
+ # Basic stats
151
+ print(aize.compute_stats(text))
152
+
153
+ # Sentiment
154
+ print(aize.analyze_sentiment(text))
155
+
156
+ # Readability
157
+ print(aize.compute_readability(text))
158
+
159
+ # Zipf's Law
160
+ print(aize.analyze_zipf(text))
161
+ ```
162
+
163
+ ---
164
+
165
+ ## Module Reference
166
+
167
+ ### `compute_stats`
168
+
169
+ ```python
170
+ from aize import compute_stats
171
+
172
+ result = compute_stats(text)
173
+ ```
174
+
175
+ Returns basic corpus statistics.
176
+
177
+ | Key | Type | Description |
178
+ |---|---|---|
179
+ | `lines` | `int` | Number of lines |
180
+ | `words` | `int` | Number of whitespace-separated words |
181
+ | `characters` | `int` | Number of characters, excluding spaces and newlines |
182
+ | `spaces` | `int` | Number of space characters |
183
+
184
+ ---
185
+
186
+ ### `analyze_groupwords`
187
+
188
+ ```python
189
+ from aize import analyze_groupwords
190
+
191
+ result = analyze_groupwords(text)
192
+ ```
193
+
194
+ Groups the unique words of the text by their character length and returns a dict mapping each length to the number of distinct words of that length, ordered by ascending length.
195
+
196
+ ---
197
+
198
+ ### `analyze_zipf`
199
+
200
+ ```python
201
+ from aize import analyze_zipf
202
+
203
+ result = analyze_zipf(text)
204
+ ```
205
+
206
+ Computes Zipf's Law statistics over the text.
207
+
208
+ | Key | Type | Description |
209
+ |---|---|---|
210
+ | `frequency` | `dict` | `{word: count}` sorted most → least frequent |
211
+ | `rank_freq` | `list[tuple]` | `[(rank, count)]` for rank-frequency plotting |
212
+ | `hapax_pct` | `float` | % of words appearing exactly once |
213
+ | `dis_pct` | `float` | % of words appearing exactly twice |
214
+ | `freq_gt2_pct` | `float` | % of words appearing more than twice |
215
+
216
+ ---
217
+
218
+ ### `analyze_heaps`
219
+
220
+ ```python
221
+ from aize import analyze_heaps
222
+
223
+ result = analyze_heaps(text)
224
+ ```
225
+
226
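+ Returns a vocabulary growth curve (Heaps' law): token counts sampled every `sample_every` tokens (default 100) with the matching unique-word counts, plus the total tokens, total unique words, and a type/token diversity percentage. Useful for visualising how the vocabulary expands as more text is read.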
227
+
228
+ ---
229
+
230
+ ### `calculate_density`
231
+
232
+ ```python
233
+ from aize import calculate_density
234
+
235
+ result = calculate_density(text)
236
+ ```
237
+
238
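+ Calculates the proportion of stopwords in the text, returning the total word count, the stopword count, the stopword density percentage, and the language used. Pass `language="spanish"` to use the Spanish stopword list; the default is English.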
239
+
240
+ ---
241
+
242
+ ### `compare_vocab`
243
+
244
+ ```python
245
+ from aize import compare_vocab
246
+
247
+ result = compare_vocab(freq_a, "doc1", freq_b, "doc2")  # freq_a / freq_b: {word: count} dicts, e.g. analyze_zipf(text)["frequency"]
248
+ ```
249
+
250
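+ Compares the vocabularies of two word-frequency dictionaries — the vocabulary size of each, the number of shared words, the number of words unique to each, and the percentage of the first vocabulary that is missing from the second.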
251
+
252
+ ---
253
+
254
+ ### `compute_tfidf`
255
+
256
+ ```python
257
+ from aize import compute_tfidf
258
+
259
+ result = compute_tfidf(
260
+ texts=["text of doc1...", "text of doc2..."],
261
+ labels=["doc1", "doc2"],
262
+ top_n=15
263
+ )
264
+ # Returns: {"doc1": [("word", score), ...], "doc2": [...]}
265
+ ```
266
+
267
+ Extracts the top `n` TF-IDF keywords for each document in a corpus. Uses scikit-learn under the hood with English stopword filtering.
268
+
269
+ ---
270
+
271
+ ### `compute_ngrams`
272
+
273
+ ```python
274
+ from aize import compute_ngrams
275
+
276
+ bigrams = compute_ngrams(text, n=2, top_n=20)
277
+ trigrams = compute_ngrams(text, n=3, top_n=20)
278
+ # Returns: [("phrase here", count), ...]
279
+ ```
280
+
281
+ Returns the most frequent n-grams (bigrams, trigrams, etc.) from the text.
282
+
283
+ ---
284
+
285
+ ### `analyze_sentiment`
286
+
287
+ ```python
288
+ from aize import analyze_sentiment
289
+
290
+ result = analyze_sentiment(text)
291
+ ```
292
+
293
+ Runs VADER sentiment analysis. NLTK's `vader_lexicon` is auto-downloaded on first use.
294
+
295
+ | Key | Type | Description |
296
+ |---|---|---|
297
+ | `positive` | `float` | Proportion of positive sentiment |
298
+ | `negative` | `float` | Proportion of negative sentiment |
299
+ | `neutral` | `float` | Proportion of neutral sentiment |
300
+ | `compound` | `float` | Overall score from `-1.0` (most negative) to `+1.0` (most positive) |
301
+ | `label` | `str` | `"Positive"`, `"Negative"`, or `"Neutral"` |
302
+
303
+ ---
304
+
305
+ ### `compute_readability`
306
+
307
+ ```python
308
+ from aize import compute_readability
309
+
310
+ result = compute_readability(text)
311
+ ```
312
+
313
+ Computes Flesch-Kincaid readability metrics.
314
+
315
+ | Key | Type | Description |
316
+ |---|---|---|
317
+ | `flesch_reading_ease` | `float` | 0–100 score; higher = easier to read |
318
+ | `fk_grade_level` | `float` | Approximate US school grade level |
319
+ | `sentences` | `int` | Sentence count |
320
+ | `words` | `int` | Word count |
321
+ | `syllables` | `int` | Total syllables |
322
+ | `interpretation` | `str` | `"Very Easy"` → `"Very Confusing"` |
323
+
324
+ ---
325
+
326
+ ### `analyze_pos`
327
+
328
+ ```python
329
+ from aize import analyze_pos
330
+
331
+ result = analyze_pos(text)
332
+ ```
333
+
334
+ Returns a part-of-speech frequency breakdown (nouns, verbs, adjectives, adverbs, etc.) using NLTK's POS tagger.
335
+
336
+ ---
337
+
338
+ ### `generate_wordcloud`
339
+
340
+ ```python
341
+ from aize import generate_wordcloud
342
+
343
+ image = generate_wordcloud(text)
344
+ ```
345
+
346
+ Generates a word cloud image from the input text. Returns the image as PNG-encoded `bytes` that can be written to a file, passed to Streamlit's `st.image`, or sent in an API response.
346
+
347
+ ```python
348
+ with open("wordcloud.png", "wb") as f: f.write(image)
350
+ ```
351
+
352
+ ---
353
+
354
+ ## Streamlit Dashboard
355
+
356
+ An interactive, browser-based UI for all analyses is included.
357
+
358
+ ```bash
359
+ streamlit run nlp_dashboard.py
360
+ ```
361
+
362
+ The dashboard lets you upload one or more `.txt` files and interactively explore all analysis modules with charts and tables powered by Plotly.
363
+
364
+ ---
365
+
366
+ ## FastAPI Backend
367
+
368
+ A REST API is included for programmatic or remote access to the toolkit.
369
+
370
+ ```bash
371
+ uvicorn api:app --reload
372
+ ```
373
+
374
+ The API will be available at `http://127.0.0.1:8000`. Interactive docs are auto-generated at:
375
+
376
+ - **Swagger UI**: `http://127.0.0.1:8000/docs`
377
+ - **ReDoc**: `http://127.0.0.1:8000/redoc`
378
+
379
+ ---
380
+
381
+ ## Dependencies
382
+
383
+ | Package | Purpose |
384
+ |---|---|
385
+ | `nltk >= 3.8` | Tokenisation, POS tagging, VADER sentiment |
386
+ | `scikit-learn >= 1.2` | TF-IDF vectorisation |
387
+ | `wordcloud >= 1.9` | Word cloud image generation |
388
+ | `pandas >= 1.5` | Data manipulation |
389
+ | `plotly >= 5.0` | Interactive charts in the dashboard |
390
+ | `streamlit >= 1.28` | Web dashboard UI |
391
+ | `fastapi >= 0.100` | REST API framework |
392
+ | `uvicorn >= 0.23` | ASGI server for FastAPI |
393
+ | `python-multipart >= 0.0.6` | File upload support for FastAPI |
394
+
395
+ ---
396
+
397
+ ## Project Structure
398
+
399
+ ```
400
+ aize/
401
+ ├── aize/ # Core library package
402
+ │ ├── __init__.py # Public API surface
403
+ │ └── analysis/
404
+ │ ├── stats.py # Basic text statistics
405
+ │ ├── groupwords.py # Word length grouping
406
+ │ ├── zipf.py # Zipf's law analysis
407
+ │ ├── heaps.py # Heap's law analysis
408
+ │ ├── stopwords.py # Stopword density
409
+ │ ├── vocab.py # Vocabulary comparison
410
+ │ ├── tfidf.py # TF-IDF & n-grams
411
+ │ ├── sentiment.py # VADER sentiment
412
+ │ ├── readability.py # Flesch-Kincaid scores
413
+ │ ├── pos.py # POS tagging
414
+ │ └── wordcloud_gen.py # Word cloud generation
415
+ ├── .github/workflows/
416
+ │ └── publish.yml # Auto-publish to PyPI on version tags
417
+ ├── nlp_dashboard.py # Streamlit dashboard
418
+ ├── api.py # FastAPI REST backend
419
+ ├── pyproject.toml # Package config & dependency extras
420
+ ├── MANIFEST.in # Source distribution file rules
421
+ ├── requirements.txt # All-inclusive dev requirements
422
+ └── README.md
423
+ ```
424
+
425
+ ---
426
+
427
+ ## License
428
+
429
+ This project is licensed under the **MIT License**. See [LICENSE](LICENSE) for details.
430
+
431
+ ---
432
+
433
+ <p align="center">Built with ❤️ using Python, NLTK, scikit-learn, Streamlit & FastAPI</p>
@@ -0,0 +1,18 @@
1
+ aize/__init__.py,sha256=ZlMvLCbJt-zqYUn3h8Zcz1qYP8GPugXvjzQWuEMhnAU,975
2
+ aize/analysis/__init__.py,sha256=f_6TvDw1XmsI3Cq2feqva-mVjt-WSRzrzVVxSh8BgSQ,28
3
+ aize/analysis/groupwords.py,sha256=hNppbDPGPrdrwcVdWLrQq0IlIC4kuwhppY43WQWo_yU,604
4
+ aize/analysis/heaps.py,sha256=8i9APYJxP6Y5lPtX38E6-9W_djII0fD5Q-XzXuqFSag,1145
5
+ aize/analysis/pos.py,sha256=1m5K8_P5ZDJ0h_7743T-iX9UQ9y4kUD-fLMJWER417U,1792
6
+ aize/analysis/readability.py,sha256=V9ON2lE4yX5jZWrzaKAcvazeJ1N_R4EF2SDDw5Fesvg,1722
7
+ aize/analysis/sentiment.py,sha256=WQZQNbVn4n_207TQ_KmfLynQ3nnKDcrG__16i7jM30c,1133
8
+ aize/analysis/stats.py,sha256=tygPGXhL8564b2Pgn6VdVoiGxJtEGAWco4TFiqvbrp4,413
9
+ aize/analysis/stopwords.py,sha256=v-aEx6-ci0lZHfwATUn6rhdL5IXEyWS2vIDy7Bs_diI,1186
10
+ aize/analysis/tfidf.py,sha256=oFcyF00_WtPuUB60PwfiMdGzDKTeAN1wdAAiTPHFjIk,1507
11
+ aize/analysis/vocab.py,sha256=GJrFuSeKJ6NfHPorb4bO_75nfF-K2NMf5XvTwHQLeBQ,966
12
+ aize/analysis/wordcloud_gen.py,sha256=KrXl_Vm7nqNh007N6StRDgKSUbjzMZH9qzRwRetO2zE,761
13
+ aize/analysis/zipf.py,sha256=QJqdKi-AsFZF0GeqoO6fdXBmCgfjuA6tVa_cikC3oEo,1459
14
+ aize-0.1.0.dist-info/licenses/LICENSE,sha256=Sf3W1N-FIZ8i8VKRAO2YYEfzc0wW8x8gd7nWh5U28LE,1072
15
+ aize-0.1.0.dist-info/METADATA,sha256=lg-Knwd7wSmyA560UjaNXXOzdnhR6cLl6W2j8-uFk4s,11911
16
+ aize-0.1.0.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
17
+ aize-0.1.0.dist-info/top_level.txt,sha256=NjP_3oz1jbCAgzzxjRFStmIGJkZ0-oZ_7pQ6LjPsMi0,5
18
+ aize-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Emmanuel Okoaze
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ aize