PyPI - aize - Versions diffs - 0.1.0__py3-none-any.whl - Mend

aize 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

aize/__init__.py +32 -0
aize/analysis/__init__.py +1 -0
aize/analysis/groupwords.py +21 -0
aize/analysis/heaps.py +40 -0
aize/analysis/pos.py +59 -0
aize/analysis/readability.py +59 -0
aize/analysis/sentiment.py +40 -0
aize/analysis/stats.py +16 -0
aize/analysis/stopwords.py +40 -0
aize/analysis/tfidf.py +47 -0
aize/analysis/vocab.py +34 -0
aize/analysis/wordcloud_gen.py +30 -0
aize/analysis/zipf.py +40 -0
aize-0.1.0.dist-info/METADATA +433 -0
aize-0.1.0.dist-info/RECORD +18 -0
aize-0.1.0.dist-info/WHEEL +5 -0
aize-0.1.0.dist-info/licenses/LICENSE +21 -0
aize-0.1.0.dist-info/top_level.txt +1 -0

aize/__init__.py ADDED Viewed

@@ -0,0 +1,32 @@
+"""
+aize — NLP Analysis Toolkit
+A lightweight, pip-installable library for text analysis.
+"""
+from aize.analysis.stats import compute_stats
+from aize.analysis.groupwords import analyze_groupwords
+from aize.analysis.zipf import analyze_zipf
+from aize.analysis.heaps import analyze_heaps
+from aize.analysis.stopwords import calculate_density
+from aize.analysis.vocab import compare_vocab
+from aize.analysis.tfidf import compute_tfidf, compute_ngrams
+from aize.analysis.sentiment import analyze_sentiment
+from aize.analysis.readability import compute_readability
+from aize.analysis.pos import analyze_pos
+from aize.analysis.wordcloud_gen import generate_wordcloud
+__version__ = "0.1.0"
+__all__ = [
+    "compute_stats",
+    "analyze_groupwords",
+    "analyze_zipf",
+    "analyze_heaps",
+    "calculate_density",
+    "compare_vocab",
+    "compute_tfidf",
+    "compute_ngrams",
+    "analyze_sentiment",
+    "compute_readability",
+    "analyze_pos",
+    "generate_wordcloud",
+]

aize/analysis/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ # aize analysis sub-package

aize/analysis/groupwords.py ADDED Viewed

@@ -0,0 +1,21 @@
+"""Word-length (groupwords) distribution analysis."""
+import re
def analyze_groupwords(text: str) -> dict:
    """
    Count distinct words per word length.

    Each whitespace-separated token is lower-cased and has leading and
    trailing punctuation stripped; tokens that end up empty are ignored.

    Returns a dict ``{length: count_of_unique_words}`` whose keys are in
    ascending order of length.
    """
    by_length: dict[int, set] = {}
    for raw in text.split():
        token = raw.lower().strip('.,!?"():;[]#«»')
        if token:
            # A set per length so repeated words are only counted once.
            by_length.setdefault(len(token), set()).add(token)
    return {length: len(by_length[length]) for length in sorted(by_length)}

aize/analysis/heaps.py ADDED Viewed

@@ -0,0 +1,40 @@
+"""Heap's Law — vocabulary growth (types vs tokens)."""
+import re
def analyze_heaps(text: str, sample_every: int = 100) -> dict:
    """
    Compute vocabulary growth (types vs. tokens) as the text is consumed.

    Tokens are alphabetic words of 3-10 letters whose letters after the
    first are lower-case; they are case-folded before being counted as
    types.

    Args:
        text: raw text to analyse.
        sample_every: record one curve point every this many tokens.
            Texts shorter than ``sample_every`` tokens yield an empty curve.

    Returns:
        {
          "tokens": [int, ...],   # token counts sampled every `sample_every`
          "types":  [int, ...],   # unique word counts at each sample
          "total_tokens": int,
          "total_types":  int,
          "diversity_pct": float, # types/tokens * 100 (0.0 for no tokens)
        }

    Raises:
        ValueError: if ``sample_every`` is smaller than 1.
    """
    if sample_every < 1:
        # Guard explicitly instead of failing later with a modulo-by-zero.
        raise ValueError("sample_every must be a positive integer")

    seen: set[str] = set()
    token_samples: list[int] = []
    type_samples: list[int] = []
    tokens = 0
    for match in re.finditer(r'\b[A-Za-z][a-z]{2,9}\b', text):
        tokens += 1
        seen.add(match.group().lower())
        if tokens % sample_every == 0:
            token_samples.append(tokens)
            type_samples.append(len(seen))

    types = len(seen)
    # Always a float so the field has one type regardless of input.
    diversity = round(types / tokens * 100, 2) if tokens else 0.0
    return {
        "tokens": token_samples,
        "types": type_samples,
        "total_tokens": tokens,
        "total_types": types,
        "diversity_pct": diversity,
    }

aize/analysis/pos.py ADDED Viewed

@@ -0,0 +1,59 @@
+"""Part-of-speech tag distribution using NLTK."""
+import re
+import nltk
# Both the older and the newer NLTK resource names are checked
# (averaged_perceptron_tagger / averaged_perceptron_tagger_eng, punkt /
# punkt_tab); whichever is missing locally is downloaded.
for _category, _resource in (
    ("taggers", "averaged_perceptron_tagger"),
    ("taggers", "averaged_perceptron_tagger_eng"),
    ("tokenizers", "punkt"),
    ("tokenizers", "punkt_tab"),
):
    try:
        nltk.data.find(f"{_category}/{_resource}")
    except LookupError:
        nltk.download(_resource, quiet=True)

# Penn Treebank tags bucketed into coarse part-of-speech groups.
# "Other" is intentionally empty: it collects every tag not listed above it.
_TAG_GROUPS = {
    "Noun": {"NN", "NNS", "NNP", "NNPS"},
    "Verb": {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"},
    "Adjective": {"JJ", "JJR", "JJS"},
    "Adverb": {"RB", "RBR", "RBS"},
    "Pronoun": {"PRP", "PRP$", "WP", "WP$"},
    "Determiner": {"DT"},
    "Preposition": {"IN"},
    "Conjunction": {"CC"},
    "Other": set(),
}
def analyze_pos(text: str) -> dict:
    """
    Count part-of-speech tags, bucketed into the groups of ``_TAG_GROUPS``.

    Only the first 10,000 characters of ``text`` are tagged to keep the
    call fast on large inputs.

    Returns:
        {"Noun": int, "Verb": int, "Adjective": int, ...} with one entry per
        group; tags that belong to no group are counted under "Other".
    """
    excerpt = text[:10_000]
    try:
        words = nltk.word_tokenize(excerpt)
    except Exception:
        # Best-effort fallback: plain whitespace splitting if tokenising fails.
        words = excerpt.split()

    tally = dict.fromkeys(_TAG_GROUPS, 0)
    for _word, tag in nltk.pos_tag(words):
        # First group whose tag set contains this tag, else "Other".
        bucket = next(
            (name for name, members in _TAG_GROUPS.items() if tag in members),
            "Other",
        )
        tally[bucket] += 1
    return tally

aize/analysis/readability.py ADDED Viewed

@@ -0,0 +1,59 @@
+"""Flesch-Kincaid readability scores."""
+import re
_VOWEL_RUN = re.compile(r'[aeiouy]+')


def _count_syllables(word: str) -> int:
    """Estimate syllables as vowel runs, discounting a trailing silent 'e'."""
    word = word.lower()
    count = len(_VOWEL_RUN.findall(word))
    if word.endswith('e') and count > 1:
        count -= 1
    return max(count, 1)


def compute_readability(text: str) -> dict:
    """
    Compute Flesch Reading Ease and Flesch-Kincaid Grade Level.

    Sentences are split on runs of ``.``, ``!`` and ``?``; words are runs of
    ASCII letters; syllables are estimated heuristically from vowel groups.

    The reported ``sentences`` and ``words`` are the real counts (0 for text
    without any). For the averages both denominators are clamped to 1 to
    avoid dividing by zero, so the scores of text without words are not
    meaningful.

    Returns:
        {
          "flesch_reading_ease": float,    # higher = easier
          "fk_grade_level": float,         # US school grade level
          "sentences": int,
          "words": int,
          "syllables": int,
          "interpretation": str,
        }
    """
    sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
    words = re.findall(r'\b[a-zA-Z]+\b', text)
    sentence_count = len(sentences)
    word_count = len(words)
    num_syllables = sum(_count_syllables(w) for w in words)

    # Clamp only the divisors; the counts returned below stay truthful.
    safe_sentences = max(sentence_count, 1)
    safe_words = max(word_count, 1)
    asl = safe_words / safe_sentences        # avg sentence length
    asw = num_syllables / safe_words         # avg syllables per word

    fre = 206.835 - (1.015 * asl) - (84.6 * asw)
    fkgl = (0.39 * asl) + (11.8 * asw) - 15.59

    if fre >= 90:
        interp = "Very Easy"
    elif fre >= 70:
        interp = "Easy"
    elif fre >= 60:
        interp = "Standard"
    elif fre >= 50:
        interp = "Fairly Difficult"
    elif fre >= 30:
        interp = "Difficult"
    else:
        interp = "Very Confusing"

    return {
        "flesch_reading_ease": round(fre, 2),
        "fk_grade_level": round(fkgl, 2),
        "sentences": sentence_count,
        "words": word_count,
        "syllables": num_syllables,
        "interpretation": interp,
    }

aize/analysis/sentiment.py ADDED Viewed

@@ -0,0 +1,40 @@
+"""VADER sentiment analysis."""
+import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Build the shared analyzer once at import time. If NLTK reports a missing
# resource, download the VADER lexicon and try again.
try:
    _sid = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download("vader_lexicon", quiet=True)
    _sid = SentimentIntensityAnalyzer()
def analyze_sentiment(text: str) -> dict:
    """
    Score ``text`` with the module-level VADER analyzer.

    The label is "Positive" when the compound score is at least 0.05,
    "Negative" when it is at most -0.05, and "Neutral" otherwise. All
    scores are rounded to four decimals.

    Returns:
        {
          "positive": float,
          "negative": float,
          "neutral":  float,
          "compound": float,   # -1 (most negative) to +1 (most positive)
          "label": str,        # "Positive" | "Negative" | "Neutral"
        }
    """
    raw = _sid.polarity_scores(text)
    compound = raw["compound"]

    label = "Neutral"
    if compound >= 0.05:
        label = "Positive"
    elif compound <= -0.05:
        label = "Negative"

    # Map the public key names onto VADER's own short keys.
    key_map = (
        ("positive", "pos"),
        ("negative", "neg"),
        ("neutral", "neu"),
        ("compound", "compound"),
    )
    report = {public: round(raw[vader], 4) for public, vader in key_map}
    report["label"] = label
    return report

aize/analysis/stats.py ADDED Viewed

@@ -0,0 +1,16 @@
+"""Word/line/char/space statistics for a text."""
+import re
def compute_stats(text: str) -> dict:
    """
    Return basic text statistics.

    Returns:
        {
          "lines": int,        # number of lines (``str.splitlines``)
          "words": int,        # whitespace-separated tokens
          "characters": int,   # characters other than spaces and line breaks
          "spaces": int,       # number of " " characters
        }
    """
    # "\r" is excluded together with "\n" so that CRLF-terminated text gives
    # the same character count as the LF-terminated equivalent.
    not_counted = (" ", "\n", "\r")
    return {
        "lines": len(text.splitlines()),
        "words": len(text.split()),
        "characters": sum(1 for c in text if c not in not_counted),
        "spaces": text.count(" "),
    }

aize/analysis/stopwords.py ADDED Viewed

@@ -0,0 +1,40 @@
+"""Stop-word density analysis using NLTK stopwords."""
+import re
+import nltk
from nltk.corpus import stopwords as nltk_sw


def _load_stopword_sets() -> dict:
    """Read the English and Spanish NLTK stop-word lists into sets."""
    return {lang: set(nltk_sw.words(lang)) for lang in ("english", "spanish")}


# Load the lists at import time; if NLTK cannot find the corpus, download
# it and retry.
try:
    STOPWORD_SETS = _load_stopword_sets()
except LookupError:
    nltk.download("stopwords", quiet=True)
    STOPWORD_SETS = _load_stopword_sets()

_en = STOPWORD_SETS["english"]
_es = STOPWORD_SETS["spanish"]
def calculate_density(text: str, language: str = "english") -> dict:
    """
    Compute stop-word density for the given text.

    Args:
        text: raw text; it is lower-cased and split into ``\\w+`` tokens.
        language: key into ``STOPWORD_SETS``, matched case-insensitively.
            Unknown languages fall back to the English stop-word set.

    Returns:
        {
          "total_words": int,
          "stop_words":  int,
          "density_pct": float,
          "language": str,      # echoed back exactly as passed in
        }
    """
    # Normalise the key so "Spanish" or " spanish " do not silently fall
    # back to the English list.
    lookup = language.strip().lower() if isinstance(language, str) else language
    sw_set = STOPWORD_SETS.get(lookup, _en)

    words = re.findall(r'\b\w+\b', text.lower())
    total = len(words)
    if not total:
        return {"total_words": 0, "stop_words": 0, "density_pct": 0.0, "language": language}

    stop_count = sum(1 for w in words if w in sw_set)
    return {
        "total_words": total,
        "stop_words": stop_count,
        "density_pct": round(stop_count / total * 100, 4),
        "language": language,
    }

aize/analysis/tfidf.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""TF-IDF keyword extraction and N-gram frequency analysis."""
+import re
+from collections import Counter
+from sklearn.feature_extraction.text import TfidfVectorizer
def compute_tfidf(texts: list[str], labels: list[str], top_n: int = 15) -> dict:
    """
    Compute TF-IDF scores across a corpus of texts.

    Args:
        texts:  list of raw text strings
        labels: matching list of file/document names (same length as texts)
        top_n:  number of top keywords to return per document

    Returns:
        {label: [(term, score), ...], ...} with terms ordered by descending
        score and zero-score terms omitted. An empty dict is returned when
        ``texts`` is empty or the vectorizer raises ``ValueError`` while
        fitting.

    Raises:
        ValueError: if ``texts`` is non-empty and ``labels`` has a different
            length.
    """
    if not texts:
        return {}
    if len(labels) != len(texts):
        # Fail early: extra labels would index past the matrix, and missing
        # labels would silently drop documents from the result.
        raise ValueError(
            f"texts and labels must have the same length "
            f"(got {len(texts)} and {len(labels)})"
        )

    vec = TfidfVectorizer(stop_words="english", max_features=5000,
                          token_pattern=r'\b[a-z]{3,}\b')
    try:
        matrix = vec.fit_transform(texts)
    except ValueError:
        return {}

    terms = vec.get_feature_names_out()
    result = {}
    for idx, label in enumerate(labels):
        scores = matrix[idx].toarray().flatten()
        top_idx = scores.argsort()[::-1][:top_n]
        result[label] = [
            (terms[i], round(float(scores[i]), 4)) for i in top_idx if scores[i] > 0
        ]
    return result
def compute_ngrams(text: str, n: int = 2, top_n: int = 20) -> list[tuple]:
    """
    Return the ``top_n`` most frequent n-grams of ``text``.

    The text is lower-cased and only runs of two or more ASCII letters
    count as words; each n-gram is reported as its words joined by a space.

    Returns:
        [(ngram_string, count), ...] ordered by descending count.
    """
    tokens = re.findall(r'\b[a-z]{2,}\b', text.lower())
    # One copy of the token list per position in the window, each shifted
    # one word further; zipping them walks every n-word window.
    shifted = [tokens[offset:] for offset in range(n)]
    tally = Counter()
    for window in zip(*shifted):
        tally[" ".join(window)] += 1
    return tally.most_common(top_n)

aize/analysis/vocab.py ADDED Viewed

@@ -0,0 +1,34 @@
+"""Vocabulary comparison across multiple texts (zero-frequency problem)."""
def compare_vocab(dict_a: dict, name_a: str, dict_b: dict, name_b: str) -> dict:
    """
    Compare the vocabularies (key sets) of two frequency dictionaries.

    Only the keys are used; the frequency values are ignored.

    Returns:
        {
          "name_a": str, "size_a": int,
          "name_b": str, "size_b": int,
          "common": int,                  # words present in both
          "only_in_a": int,
          "only_in_b": int,
          "pct_a_missing_from_b": float,  # 0.0 when A is empty
        }
    """
    vocab_a = set(dict_a.keys())
    vocab_b = set(dict_b.keys())
    exclusive_a = vocab_a.difference(vocab_b)

    if vocab_a:
        pct_missing = round(len(exclusive_a) / len(vocab_a) * 100, 2)
    else:
        pct_missing = 0.0

    report = {"name_a": name_a, "size_a": len(vocab_a)}
    report["name_b"] = name_b
    report["size_b"] = len(vocab_b)
    report["common"] = len(vocab_a.intersection(vocab_b))
    report["only_in_a"] = len(exclusive_a)
    report["only_in_b"] = len(vocab_b.difference(vocab_a))
    report["pct_a_missing_from_b"] = pct_missing
    return report

aize/analysis/wordcloud_gen.py ADDED Viewed

@@ -0,0 +1,30 @@
+"""Word cloud image generation."""
+import re
+import io
def generate_wordcloud(text: str, width: int = 800, height: int = 400,
                       background_color: str = "white") -> bytes:
    """
    Render a word cloud for ``text`` and return it as PNG data.

    The text is lower-cased and every character that is neither an ASCII
    letter nor whitespace is replaced by a space before rendering.

    Returns:
        PNG image as bytes (ready for Streamlit st.image or API response).
    """
    # Imported inside the function so importing this module does not load
    # the wordcloud package.
    from wordcloud import WordCloud

    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())
    settings = {
        "width": width,
        "height": height,
        "background_color": background_color,
        "collocations": False,
        "max_words": 200,
        "min_word_length": 3,
    }
    cloud = WordCloud(**settings).generate(letters_only)

    with io.BytesIO() as png:
        cloud.to_image().save(png, format="PNG")
        return png.getvalue()

aize/analysis/zipf.py ADDED Viewed

@@ -0,0 +1,40 @@
+"""Zipf's Law analysis — rank/frequency + hapax/dis legomena."""
+import re
def analyze_zipf(text: str) -> dict:
    """
    Compute word frequency distribution and Zipf statistics.

    Words are alphabetic tokens of 3-10 letters whose letters after the
    first are lower-case. They are case-folded before counting, so "The"
    and "the" are the same word.

    Returns:
        {
          "frequency": {word: count, ...},    # sorted most→least frequent
          "rank_freq": [(rank, count), ...],   # for rank-frequency plot
          "hapax_pct": float,                  # % words appearing once
          "dis_pct":   float,                  # % words appearing twice
          "freq_gt2_pct": float,               # % words appearing > 2 times
        }
        The percentages are relative to the number of distinct words and
        are 0.0 when the text contains no matching words.
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    words = re.findall(r'\b[A-Za-z][a-z]{2,9}\b', text)
    # The pattern accepts a capitalised first letter, so fold case to avoid
    # counting sentence-initial words as separate vocabulary entries.
    counts = Counter(word.lower() for word in words)
    total = len(counts)
    if total == 0:
        return {
            "frequency": {},
            "rank_freq": [],
            "hapax_pct": 0.0,
            "dis_pct": 0.0,
            "freq_gt2_pct": 0.0,
        }

    hapax = sum(1 for c in counts.values() if c == 1)
    dis = sum(1 for c in counts.values() if c == 2)
    gt2 = total - hapax - dis

    # most_common() sorts by descending count and keeps first-seen order
    # among ties.
    sorted_freq = dict(counts.most_common())
    rank_freq = list(enumerate(sorted_freq.values(), start=1))
    return {
        "frequency": sorted_freq,
        "rank_freq": rank_freq,
        "hapax_pct": round(hapax / total * 100, 2),
        "dis_pct": round(dis / total * 100, 2),
        "freq_gt2_pct": round(gt2 / total * 100, 2),
    }

aize-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,433 @@
+Metadata-Version: 2.4
+Name: aize
+Version: 0.1.0
+Summary: aize — lightweight NLP analysis toolkit (Zipf, Heap's law, TF-IDF, sentiment, readability & more)
+Author: eokoaze
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/eokoaze/aize
+Project-URL: Repository, https://github.com/eokoaze/aize
+Project-URL: Bug Tracker, https://github.com/eokoaze/aize/issues
+Keywords: nlp,natural-language-processing,text-analysis,zipf,tfidf,sentiment,readability,wordcloud
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Text Processing :: Linguistic
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: nltk>=3.8
+Requires-Dist: scikit-learn>=1.2
+Requires-Dist: wordcloud>=1.9
+Requires-Dist: pandas>=1.5
+Provides-Extra: dashboard
+Requires-Dist: streamlit>=1.28; extra == "dashboard"
+Requires-Dist: plotly>=5.0; extra == "dashboard"
+Requires-Dist: Pillow>=9.0; extra == "dashboard"
+Provides-Extra: api
+Requires-Dist: fastapi>=0.100; extra == "api"
+Requires-Dist: uvicorn>=0.23; extra == "api"
+Requires-Dist: python-multipart>=0.0.6; extra == "api"
+Provides-Extra: all
+Requires-Dist: aize[dashboard]; extra == "all"
+Requires-Dist: aize[api]; extra == "all"
+Provides-Extra: dev
+Requires-Dist: aize[all]; extra == "dev"
+Requires-Dist: build>=1.0; extra == "dev"
+Requires-Dist: twine>=5.0; extra == "dev"
+Dynamic: license-file
+# aize · NLP Analysis Toolkit
+[![PyPI version](https://img.shields.io/pypi/v/aize.svg)](https://pypi.org/project/aize/)
+[![Python](https://img.shields.io/pypi/pyversions/aize.svg)](https://pypi.org/project/aize/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+> A lightweight, pip-installable Python library for deep text analysis — covering everything from Zipf's law to sentiment, readability, TF-IDF, and more. Comes with a Streamlit dashboard and a FastAPI backend out of the box.
+---
+## Table of Contents
+- [Features](#features)
+- [Installation](#installation)
+- [Quick Start](#quick-start)
+- [Module Reference](#module-reference)
+  - [compute_stats](#compute_stats)
+  - [analyze_groupwords](#analyze_groupwords)
+  - [analyze_zipf](#analyze_zipf)
+  - [analyze_heaps](#analyze_heaps)
+  - [calculate_density](#calculate_density)
+  - [compare_vocab](#compare_vocab)
+  - [compute_tfidf](#compute_tfidf)
+  - [compute_ngrams](#compute_ngrams)
+  - [analyze_sentiment](#analyze_sentiment)
+  - [compute_readability](#compute_readability)
+  - [analyze_pos](#analyze_pos)
+  - [generate_wordcloud](#generate_wordcloud)
+- [Streamlit Dashboard](#streamlit-dashboard)
+- [FastAPI Backend](#fastapi-backend)
+- [Dependencies](#dependencies)
+- [Project Structure](#project-structure)
+- [License](#license)
+---
+## Features
+| Category | Capability |
+|---|---|
+| 📊 **Statistics** | Line, word, character, and space counts |
+| 📏 **Word Grouping** | Number of unique words grouped by word length |
+| 📉 **Zipf's Law** | Rank-frequency distribution, hapax & dis legomena percentages |
+| 📈 **Heap's Law** | Vocabulary growth curve as corpus size increases |
+| 🚫 **Stopwords** | Stopword density analysis |
+| 🔤 **Vocabulary** | Vocabulary overlap statistics for two word-frequency dictionaries |
+| 🔍 **TF-IDF** | Top keyword extraction per document in a corpus |
+| 🔗 **N-grams** | Most common bigrams and trigrams |
+| 💬 **Sentiment** | VADER-based positive / negative / neutral / compound scoring |
+| 📖 **Readability** | Flesch Reading Ease & Flesch-Kincaid Grade Level |
+| 🏷️ **POS Tagging** | Part-of-speech frequency breakdown |
+| ☁️ **Word Cloud** | Generates word cloud images from any text |
+| 🖥️ **Dashboard** | Interactive Streamlit UI for all analyses |
+| ⚡ **API** | FastAPI REST backend for programmatic access |
+---
+## Installation
+### Core library
+```bash
+pip install aize
+```
+### With the Streamlit dashboard
+```bash
+pip install aize[dashboard]
+```
+### With the FastAPI backend
+```bash
+pip install aize[api]
+```
+### Everything (dashboard + API)
+```bash
+pip install aize[all]
+```
+### From source (development)
+```bash
+git clone https://github.com/eokoaze/aize.git
+cd aize
+pip install -e .[all]
+```
+> **Python 3.9+** is required.
+---
+## Quick Start
+```python
+import aize
+text = """
+Natural language processing is a subfield of linguistics and artificial intelligence.
+It is primarily concerned with giving computers the ability to understand text and speech.
+"""
+# Basic stats
+print(aize.compute_stats(text))
+# Sentiment
+print(aize.analyze_sentiment(text))
+# Readability
+print(aize.compute_readability(text))
+# Zipf's Law
+print(aize.analyze_zipf(text))
+```
+---
+## Module Reference
+### `compute_stats`
+```python
+from aize import compute_stats
+result = compute_stats(text)
+```
+Returns basic corpus statistics.
+| Key | Type | Description |
+|---|---|---|
+| `lines` | `int` | Number of lines |
+| `words` | `int` | Number of whitespace-separated words |
+| `characters` | `int` | Number of characters, excluding spaces and newlines |
+| `spaces` | `int` | Number of space characters |
+---
+### `analyze_groupwords`
+```python
+from aize import analyze_groupwords
+result = analyze_groupwords(text)
+```
+Groups the unique words of the text by their character length and returns a `{length: unique_word_count}` dictionary, sorted by length.
+---
+### `analyze_zipf`
+```python
+from aize import analyze_zipf
+result = analyze_zipf(text)
+```
+Computes Zipf's Law statistics over the text.
+| Key | Type | Description |
+|---|---|---|
+| `frequency` | `dict` | `{word: count}` sorted most → least frequent |
+| `rank_freq` | `list[tuple]` | `[(rank, count)]` for rank-frequency plotting |
+| `hapax_pct` | `float` | % of words appearing exactly once |
+| `dis_pct` | `float` | % of words appearing exactly twice |
+| `freq_gt2_pct` | `float` | % of words appearing more than twice |
+---
+### `analyze_heaps`
+```python
+from aize import analyze_heaps
+result = analyze_heaps(text)
+```
+Returns a vocabulary growth curve (Heap's Law). Useful for visualising how the vocabulary expands as more text is read.
+---
+### `calculate_density`
+```python
+from aize import calculate_density
+result = calculate_density(text)
+```
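+Calculates the proportion of stopwords in the text. Returns the total word count (`total_words`), the number of stopwords (`stop_words`), the stopword density as a percentage (`density_pct`), and the `language` used. Pass `language="spanish"` to use the Spanish stopword list; the default is `"english"`.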
+---
+### `compare_vocab`
+```python
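+from aize import analyze_zipf, compare_vocab
+freq_a = analyze_zipf(text1)["frequency"]
+freq_b = analyze_zipf(text2)["frequency"]
+result = compare_vocab(freq_a, "doc1", freq_b, "doc2")
+```
+Compares the vocabularies of two word-frequency dictionaries: the size of each vocabulary, the number of shared words, the number of words found on only one side, and the percentage of the first vocabulary that is missing from the second.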
+---
+### `compute_tfidf`
+```python
+from aize import compute_tfidf
+result = compute_tfidf(
+    texts=["text of doc1...", "text of doc2..."],
+    labels=["doc1", "doc2"],
+    top_n=15
+)
+# Returns: {"doc1": [("word", score), ...], "doc2": [...]}
+```
+Extracts the top `n` TF-IDF keywords for each document in a corpus. Uses scikit-learn under the hood with English stopword filtering.
+---
+### `compute_ngrams`
+```python
+from aize import compute_ngrams
+bigrams  = compute_ngrams(text, n=2, top_n=20)
+trigrams = compute_ngrams(text, n=3, top_n=20)
+# Returns: [("phrase here", count), ...]
+```
+Returns the most frequent n-grams (bigrams, trigrams, etc.) from the text.
+---
+### `analyze_sentiment`
+```python
+from aize import analyze_sentiment
+result = analyze_sentiment(text)
+```
+Runs VADER sentiment analysis. NLTK's `vader_lexicon` is downloaded automatically when the module is first imported, if it is not already installed.
+| Key | Type | Description |
+|---|---|---|
+| `positive` | `float` | Proportion of positive sentiment |
+| `negative` | `float` | Proportion of negative sentiment |
+| `neutral` | `float` | Proportion of neutral sentiment |
+| `compound` | `float` | Overall score from `-1.0` (most negative) to `+1.0` (most positive) |
+| `label` | `str` | `"Positive"`, `"Negative"`, or `"Neutral"` |
+---
+### `compute_readability`
+```python
+from aize import compute_readability
+result = compute_readability(text)
+```
+Computes Flesch-Kincaid readability metrics.
+| Key | Type | Description |
+|---|---|---|
+| `flesch_reading_ease` | `float` | 0–100 score; higher = easier to read |
+| `fk_grade_level` | `float` | Approximate US school grade level |
+| `sentences` | `int` | Sentence count |
+| `words` | `int` | Word count |
+| `syllables` | `int` | Total syllables |
+| `interpretation` | `str` | `"Very Easy"` → `"Very Confusing"` |
+---
+### `analyze_pos`
+```python
+from aize import analyze_pos
+result = analyze_pos(text)
+```
+Returns a part-of-speech frequency breakdown (nouns, verbs, adjectives, adverbs, etc.) using NLTK's POS tagger.
+---
+### `generate_wordcloud`
+```python
+from aize import generate_wordcloud
+image = generate_wordcloud(text)
+```
+Generates a word cloud image from the input text. Returns the image as PNG-encoded `bytes`, which can be written to a file or passed straight to `st.image` or an API response.
+```python
+with open("wordcloud.png", "wb") as f:
+    f.write(image)
+```
+---
+## Streamlit Dashboard
+An interactive, browser-based UI for all analyses is included.
+```bash
+streamlit run nlp_dashboard.py
+```
+The dashboard lets you upload one or more `.txt` files and interactively explore all analysis modules with charts and tables powered by Plotly.
+---
+## FastAPI Backend
+A REST API is included for programmatic or remote access to the toolkit.
+```bash
+uvicorn api:app --reload
+```
+The API will be available at `http://127.0.0.1:8000`. Interactive docs are auto-generated at:
+- **Swagger UI**: `http://127.0.0.1:8000/docs`
+- **ReDoc**: `http://127.0.0.1:8000/redoc`
+---
+## Dependencies
+| Package | Purpose |
+|---|---|
+| `nltk >= 3.8` | Tokenisation, POS tagging, VADER sentiment |
+| `scikit-learn >= 1.2` | TF-IDF vectorisation |
+| `wordcloud >= 1.9` | Word cloud image generation |
+| `pandas >= 1.5` | Data manipulation |
+| `plotly >= 5.0` | Interactive charts in the dashboard |
+| `streamlit >= 1.28` | Web dashboard UI |
+| `fastapi >= 0.100` | REST API framework |
+| `uvicorn >= 0.23` | ASGI server for FastAPI |
+| `python-multipart >= 0.0.6` | File upload support for FastAPI |
+---
+## Project Structure
+```
+aize/
+├── aize/                        # Core library package
+│   ├── __init__.py              # Public API surface
+│   └── analysis/
+│       ├── stats.py             # Basic text statistics
+│       ├── groupwords.py        # Word length grouping
+│       ├── zipf.py              # Zipf's law analysis
+│       ├── heaps.py             # Heap's law analysis
+│       ├── stopwords.py         # Stopword density
+│       ├── vocab.py             # Vocabulary comparison
+│       ├── tfidf.py             # TF-IDF & n-grams
+│       ├── sentiment.py         # VADER sentiment
+│       ├── readability.py       # Flesch-Kincaid scores
+│       ├── pos.py               # POS tagging
+│       └── wordcloud_gen.py     # Word cloud generation
+├── .github/workflows/
+│   └── publish.yml              # Auto-publish to PyPI on version tags
+├── nlp_dashboard.py             # Streamlit dashboard
+├── api.py                       # FastAPI REST backend
+├── pyproject.toml               # Package config & dependency extras
+├── MANIFEST.in                  # Source distribution file rules
+├── requirements.txt             # All-inclusive dev requirements
+└── README.md
+```
+---
+## License
+This project is licensed under the **MIT License**. See [LICENSE](LICENSE) for details.
+---
+<p align="center">Built with ❤️ using Python, NLTK, scikit-learn, Streamlit & FastAPI</p>

aize-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,18 @@
+aize/__init__.py,sha256=ZlMvLCbJt-zqYUn3h8Zcz1qYP8GPugXvjzQWuEMhnAU,975
+aize/analysis/__init__.py,sha256=f_6TvDw1XmsI3Cq2feqva-mVjt-WSRzrzVVxSh8BgSQ,28
+aize/analysis/groupwords.py,sha256=hNppbDPGPrdrwcVdWLrQq0IlIC4kuwhppY43WQWo_yU,604
+aize/analysis/heaps.py,sha256=8i9APYJxP6Y5lPtX38E6-9W_djII0fD5Q-XzXuqFSag,1145
+aize/analysis/pos.py,sha256=1m5K8_P5ZDJ0h_7743T-iX9UQ9y4kUD-fLMJWER417U,1792
+aize/analysis/readability.py,sha256=V9ON2lE4yX5jZWrzaKAcvazeJ1N_R4EF2SDDw5Fesvg,1722
+aize/analysis/sentiment.py,sha256=WQZQNbVn4n_207TQ_KmfLynQ3nnKDcrG__16i7jM30c,1133
+aize/analysis/stats.py,sha256=tygPGXhL8564b2Pgn6VdVoiGxJtEGAWco4TFiqvbrp4,413
+aize/analysis/stopwords.py,sha256=v-aEx6-ci0lZHfwATUn6rhdL5IXEyWS2vIDy7Bs_diI,1186
+aize/analysis/tfidf.py,sha256=oFcyF00_WtPuUB60PwfiMdGzDKTeAN1wdAAiTPHFjIk,1507
+aize/analysis/vocab.py,sha256=GJrFuSeKJ6NfHPorb4bO_75nfF-K2NMf5XvTwHQLeBQ,966
+aize/analysis/wordcloud_gen.py,sha256=KrXl_Vm7nqNh007N6StRDgKSUbjzMZH9qzRwRetO2zE,761
+aize/analysis/zipf.py,sha256=QJqdKi-AsFZF0GeqoO6fdXBmCgfjuA6tVa_cikC3oEo,1459
+aize-0.1.0.dist-info/licenses/LICENSE,sha256=Sf3W1N-FIZ8i8VKRAO2YYEfzc0wW8x8gd7nWh5U28LE,1072
+aize-0.1.0.dist-info/METADATA,sha256=lg-Knwd7wSmyA560UjaNXXOzdnhR6cLl6W2j8-uFk4s,11911
+aize-0.1.0.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
+aize-0.1.0.dist-info/top_level.txt,sha256=NjP_3oz1jbCAgzzxjRFStmIGJkZ0-oZ_7pQ6LjPsMi0,5
+aize-0.1.0.dist-info/RECORD,,

aize-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.0)
+Root-Is-Purelib: true
+Tag: py3-none-any

aize-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Emmanuel Okoaze
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

aize-0.1.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ aize