suur-data 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
suur_data/__init__.py ADDED
@@ -0,0 +1,233 @@
1
+ """
2
+ suur_data
3
+ ========
4
+ A smart data ingestion and tokenization pipeline.
5
+
6
+ Python API usage:
7
+ from suur_data import suur_data
8
+
9
+ tokens = suur_data("https://example.com/article.html")
10
+ tokens = suur_data("my_corpus.pdf", topic="quantum computing")
11
+ tokens = suur_data("data.txt", tokenizer="custom", vocab_size=4000)
12
+
13
+ CLI usage:
14
+ suur-data fetch <source> [OPTIONS]
15
+
16
+ Options:
17
+ --topic TEXT Topic/subject to filter content by
18
+ --tokenizer [pretrained|custom]
19
+ --model TEXT Pretrained model name (default: gpt2)
20
+ --vocab-size INT BPE vocab size for custom mode (default: 8000)
21
+ --threshold FLOAT Relevance threshold 0.0–1.0 (default: 0.05)
22
+ --save-dir PATH Where to save tokenizer artifacts
23
+ --no-filter Skip the relevance filter
24
+ --output PATH Save token IDs to a JSON file
+ --quiet Suppress progress output
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import os
30
+ import sys
31
+ import json
32
+ import tempfile
33
+ from typing import List, Optional
34
+
35
+ import click
36
+
37
+ from .ingest import ingest
38
+ from .filter import filter_chunks
39
+ from .tokenizer import tokenize_pretrained, tokenize_custom
40
+
41
+
42
+ # ---------------------------------------------------------------------------
43
+ # Python API
44
+ # ---------------------------------------------------------------------------
45
+
46
def suur_data(
    data_location: str,
    topic: str = "",
    tokenizer: str = "pretrained",
    model: str = "gpt2",
    vocab_size: int = 8000,
    threshold: float = 0.05,
    save_dir: Optional[str] = None,
    no_filter: bool = False,
    verbose: bool = True,
) -> List[int]:
    """
    End-to-end pipeline: ingest → filter → tokenize.

    Parameters
    ----------
    data_location : str
        URL or local file path. Supports .txt, .pdf, .docx, .csv,
        .json, .html, .htm, .epub, .md, .rst
    topic : str
        Subject/keyword for the relevance filter.
        Leave empty to skip filtering.
    tokenizer : str
        "custom" selects the custom BPE tokenizer; any other value
        (default "pretrained") selects the pretrained tokenizer.
    model : str
        HuggingFace model name for pretrained mode (default: "gpt2").
    vocab_size : int
        BPE vocabulary size for custom mode (default: 8000).
    threshold : float
        Cosine similarity cutoff for relevance filter (default: 0.05).
    save_dir : str | None
        Directory to save tokenizer files. None = don't save.
    no_filter : bool
        If True, skip the relevance filter and tokenize everything.
    verbose : bool
        Print progress information.

    Returns
    -------
    List[int]
        List of integer token IDs.
    """
    # --- Stage 1: Ingest ---
    if verbose:
        print(f"\n[suur_data] Stage 1 — Ingesting: {data_location}")
    # The scratch directory only has to outlive ingest(): the text is fully
    # read into memory there, so the directory (and any downloaded file in
    # it) is removed as soon as ingestion returns or raises.
    with tempfile.TemporaryDirectory(prefix="suur_data_") as tmp:
        raw_text = ingest(data_location, tmp_dir=tmp)

    # --- Stage 2: Filter ---
    if no_filter or not topic.strip():
        if verbose:
            if no_filter:
                print("[suur_data] Stage 2 — Filter: SKIPPED (--no-filter)")
            else:
                print("[suur_data] Stage 2 — Filter: SKIPPED (no topic given)")
        filtered_text = raw_text
    else:
        if verbose:
            print(f"[suur_data] Stage 2 — Neural Filter (topic: '{topic}')")
        # The per-chunk scores returned alongside the text are not needed here.
        filtered_text, _ = filter_chunks(
            raw_text,
            topic=topic,
            threshold=threshold,
            verbose=verbose,
        )

    # --- Stage 3: Tokenize ---
    if verbose:
        print(f"[suur_data] Stage 3 — Tokenizing ({tokenizer} mode)")

    if tokenizer == "custom":
        tokens = tokenize_custom(filtered_text, vocab_size=vocab_size, save_dir=save_dir)
    else:
        tokens = tokenize_pretrained(filtered_text, model_name=model, save_dir=save_dir)

    if verbose:
        print(f"\n[suur_data] Done. Total tokens: {len(tokens):,}\n")

    return tokens
126
+
127
+
128
+ # ---------------------------------------------------------------------------
129
+ # CLI
130
+ # ---------------------------------------------------------------------------
131
+
132
# Root click command group; the sub-commands below attach themselves to it
# through @cli.command(...). The function body is intentionally empty: the
# docstring is the only content.
# NOTE(review): the "\b" line is presumably click's marker for "do not rewrap
# the following paragraph" in --help output — confirm against the click docs.
@click.group()
@click.version_option("1.0.0", prog_name="suur-data")
def cli():
    """
    \b
    Suur Data — intelligent data ingestion and tokenization pipeline.

    Fetch any text source, filter it by topic using TF-IDF relevance
    scoring, then tokenize with a pretrained or custom BPE tokenizer.
    """
142
+
143
+
144
@cli.command("fetch")
@click.argument("source")
@click.option("--topic", default="", show_default=True, help="Topic for relevance filtering.")
@click.option("--tokenizer", default="pretrained", show_default=True, type=click.Choice(["pretrained", "custom"]), help="Tokenizer mode.")
@click.option("--model", default="gpt2", show_default=True, help="Pretrained model name (HuggingFace).")
@click.option("--vocab-size", default=8000, show_default=True, help="BPE vocab size for custom tokenizer.")
@click.option("--threshold", default=0.05, show_default=True, help="Relevance filter threshold (0.0–1.0).")
@click.option("--save-dir", default=None, help="Directory to save tokenizer artifacts.")
@click.option("--no-filter", is_flag=True, default=False, help="Skip the relevance filter.")
@click.option("--output", default=None, help="Save token IDs to a JSON file.")
@click.option("--quiet", is_flag=True, default=False, help="Suppress progress output.")
def fetch_cmd(source, topic, tokenizer, model, vocab_size, threshold,
              save_dir, no_filter, output, quiet):
    """
    Ingest SOURCE (URL or file path), filter by topic, and tokenize.

    \b
    Examples:
      suur-data fetch https://en.wikipedia.org/wiki/Neuroscience --topic "brain"
      suur-data fetch corpus.pdf --topic "machine learning" --tokenizer custom
      suur-data fetch data.txt --no-filter --tokenizer pretrained --model bert
    """
    # The docstring above is the --help text. Its examples use the installed
    # console-script name ("suur-data", see entry_points.txt), not the Python
    # package name.
    tokens = suur_data(
        data_location=source,
        topic=topic,
        tokenizer=tokenizer,
        model=model,
        vocab_size=vocab_size,
        threshold=threshold,
        save_dir=save_dir,
        no_filter=no_filter,
        verbose=not quiet,  # the CLI exposes only --quiet; the API takes verbose
    )

    if output:
        # Explicit encoding so the file does not depend on the platform locale.
        with open(output, "w", encoding="utf-8") as f:
            json.dump(tokens, f)
        click.echo(f"Tokens saved to {output}")
    else:
        # No output file requested: print a short summary instead.
        click.echo(f"Token count: {len(tokens):,}")
        click.echo(f"First 50 tokens: {tokens[:50]}")
185
+
186
+
187
@cli.command("models")
def models_cmd():
    """List supported pretrained model shortcuts."""
    # (shortcut, human-readable name) pairs, printed in this order.
    shortcuts = (
        ("gpt2", "GPT-2 (OpenAI)"),
        ("bert", "BERT base uncased"),
        ("roberta", "RoBERTa base"),
        ("distilbert", "DistilBERT base uncased"),
        ("t5", "T5 small"),
    )
    click.echo("\nSupported pretrained model shortcuts:\n")
    for shortcut, label in shortcuts:
        # Shortcut column is left-aligned and padded to 14 characters.
        line = f" {shortcut:<14} {label}"
        click.echo(line)
    click.echo("\nYou can also pass any HuggingFace model ID directly.")
    click.echo('Example: --model "facebook/opt-125m"\n')
202
+
203
+
204
@cli.command("formats")
def formats_cmd():
    """List supported input file formats."""
    # (extension(s), description) pairs, printed in this order.
    supported = (
        (".txt / .md / .rst", "Plain text, Markdown, reStructuredText"),
        (".pdf", "PDF documents (requires pdfminer.six)"),
        (".docx", "Word documents (requires python-docx)"),
        (".csv / .tsv", "Comma/tab-separated values"),
        (".json", "JSON — recursively flattens key-value pairs"),
        (".html / .htm", "HTML pages (requires beautifulsoup4)"),
        (".epub", "E-books (requires ebooklib + beautifulsoup4)"),
        ("URL", "Any HTTP/HTTPS URL — auto-downloaded"),
    )
    click.echo("\nSupported input formats:\n")
    for extensions, description in supported:
        # Extension column is left-aligned and padded to 22 characters.
        line = f" {extensions:<22} {description}"
        click.echo(line)
    click.echo()
221
+
222
+
223
+ # ---------------------------------------------------------------------------
224
+ # Console-script entry point (installed as `suur-data`); `python -m suur_data` would additionally need a __main__.py
225
+ # ---------------------------------------------------------------------------
226
+
227
def main():
    """Entry point referenced by the console script: run the click group."""
    cli()


# Only runs when this file is executed directly as a script.
if __name__ == "__main__":
    main()
233
+
suur_data/filter.py ADDED
@@ -0,0 +1,183 @@
1
+ """
2
+ suur_data.filter
3
+ ---------------
4
+ Stage 2 — Neural relevance filter.
5
+
6
+ Architecture:
7
+ - Splits raw text into paragraph-level chunks
8
+ - Embeds chunks using TF-IDF (fast, no GPU needed)
9
+ - Scores each chunk against a topic query using cosine similarity
10
+ - Keeps only chunks above a configurable threshold
11
+ - Falls back to a built-in pure-Python TF-IDF scorer if sklearn is missing
12
+
13
+ This is intentionally lightweight so it runs on CPU with no external
14
+ model downloads. For better quality, swap in a sentence-transformers
15
+ embedding in the future.
16
+ """
17
+
18
+ import re
19
+ import math
20
+ from typing import List, Tuple
21
+
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # 1. Text splitter
25
+ # ---------------------------------------------------------------------------
26
+
27
def split_chunks(text: str, min_words: int = 20) -> List[str]:
    """
    Break text into paragraph-based chunks of at least `min_words` words.

    Paragraphs are separated by one or more blank lines. A paragraph that is
    too short is carried forward and prepended (space-joined) to the next
    one. Whatever is still pending at the end becomes a final chunk, even if
    it is shorter than `min_words`.
    """
    result: List[str] = []
    pending = ""
    for piece in re.split(r"\n{2,}", text):
        piece = piece.strip()
        if piece:
            candidate = f"{pending} {piece}".strip() if pending else piece
            if len(candidate.split()) < min_words:
                # Still too short: keep accumulating.
                pending = candidate
            else:
                result.append(candidate)
                pending = ""

    if pending:
        result.append(pending)

    return result
52
+
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # 2. TF-IDF based relevance scorer
56
+ # ---------------------------------------------------------------------------
57
+
58
+ def _tokenize(text: str) -> List[str]:
59
+ return re.findall(r"[a-z]+", text.lower())
60
+
61
+
62
+ def _tf(tokens: List[str]) -> dict:
63
+ counts: dict = {}
64
+ for t in tokens:
65
+ counts[t] = counts.get(t, 0) + 1
66
+ total = len(tokens) or 1
67
+ return {t: c / total for t, c in counts.items()}
68
+
69
+
70
+ def _idf(docs: List[List[str]], vocab: set) -> dict:
71
+ N = len(docs)
72
+ idf: dict = {}
73
+ for word in vocab:
74
+ df = sum(1 for doc in docs if word in doc)
75
+ idf[word] = math.log((N + 1) / (df + 1)) + 1
76
+ return idf
77
+
78
+
79
+ def _cosine(a: dict, b: dict) -> float:
80
+ dot = sum(a.get(k, 0) * v for k, v in b.items())
81
+ norm_a = math.sqrt(sum(v * v for v in a.values())) or 1e-9
82
+ norm_b = math.sqrt(sum(v * v for v in b.values())) or 1e-9
83
+ return dot / (norm_a * norm_b)
84
+
85
+
86
def _tfidf_score(query_tokens: List[str],
                 chunks: List[str]) -> List[float]:
    """
    Score each chunk against the query using TF-IDF cosine similarity.

    The query is treated as one extra document when computing IDF. The
    returned list has one score per chunk, in the same order as `chunks`.
    """
    chunk_tokens = [_tokenize(chunk) for chunk in chunks]
    corpus = [query_tokens] + chunk_tokens
    vocabulary = {token for doc in corpus for token in doc}
    weights = _idf(corpus, vocabulary)

    def _weighted(tokens):
        # Sparse TF-IDF vector for one token list.
        return {
            token: freq * weights.get(token, 1)
            for token, freq in _tf(tokens).items()
        }

    query_vec = _weighted(query_tokens)
    return [_cosine(query_vec, _weighted(tokens)) for tokens in chunk_tokens]
102
+
103
+
104
+ # ---------------------------------------------------------------------------
105
+ # 3. Optional: sklearn TF-IDF (higher quality)
106
+ # ---------------------------------------------------------------------------
107
+
108
def _sklearn_score(query: str, chunks: List[str]) -> List[float]:
    """
    Score each chunk against `query` with scikit-learn's TF-IDF vectorizer.

    The query and the chunks are vectorized together (English stop words
    removed, unigrams and bigrams) and the cosine similarity between the
    query row and every chunk row is returned, in chunk order.

    Raises ImportError when scikit-learn is not installed.
    """
    # Imported lazily so the module still loads without scikit-learn.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    if not chunks:
        # Nothing to compare against; avoid calling cosine_similarity with
        # an empty chunk matrix.
        return []

    docs = [query] + chunks
    vec = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
    matrix = vec.fit_transform(docs)
    q_vec = matrix[0]        # row 0 is the query
    chunk_vecs = matrix[1:]  # remaining rows are the chunks
    sims = cosine_similarity(q_vec, chunk_vecs)[0]
    return sims.tolist()
120
+
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # 4. Main filter function
124
+ # ---------------------------------------------------------------------------
125
+
126
def filter_chunks(
    text: str,
    topic: str,
    threshold: float = 0.05,
    min_words: int = 20,
    verbose: bool = True,
) -> Tuple[str, List[Tuple[str, float]]]:
    """
    Filter raw text to keep only chunks relevant to `topic`.

    Returns:
        filtered_text — kept chunks joined by blank lines, in the order
                        they appear in the original text
        scored        — list of (chunk, score) for all chunks (sorted desc)

    If no chunk can be formed, the input text is returned unchanged with an
    empty score list. If no chunk reaches `threshold`, the top third of the
    chunks (at least one) is kept instead of returning nothing.
    """
    chunks = split_chunks(text, min_words=min_words)
    if not chunks:
        return text, []

    if verbose:
        print(f" Scoring {len(chunks)} chunks against topic: '{topic}' ...")

    # Use sklearn if available for better quality. ValueError is also caught
    # because the sklearn vectorizer rejects input that contains nothing but
    # stop words ("empty vocabulary"); the built-in scorer has no stop list.
    try:
        scores = _sklearn_score(topic, chunks)
        method = "sklearn TF-IDF"
    except (ImportError, ValueError):
        scores = _tfidf_score(_tokenize(topic), chunks)
        method = "built-in TF-IDF"

    if verbose:
        print(f" Scoring method: {method}")

    # Rank chunk indices by score (stable, so ties keep document order).
    # Working with indices lets the kept chunks be emitted in document order
    # and keeps duplicate chunk texts distinct.
    ranked = sorted(range(len(chunks)), key=lambda i: scores[i], reverse=True)
    scored = [(chunks[i], scores[i]) for i in ranked]

    # Dynamic threshold: if topic is empty/generic, keep everything
    if not topic.strip():
        keep_idx = list(range(len(chunks)))
        if verbose:
            print(" No topic given — keeping all chunks.")
    else:
        keep_idx = [i for i in ranked if scores[i] >= threshold]
        dropped = len(chunks) - len(keep_idx)
        if verbose:
            top = scored[:3]
            print(f" Kept {len(keep_idx)}/{len(chunks)} chunks (dropped {dropped} below threshold {threshold})")
            print(f" Top scores: {[round(s,4) for _, s in top]}")

        # If filter is too aggressive, relax automatically
        if not keep_idx:
            # Keep the top third (at least one chunk) regardless of score
            n = max(1, len(chunks) // 3)
            keep_idx = ranked[:n]
            if verbose:
                print(f" Threshold too strict — auto-relaxed, keeping top {n} chunks.")

    # Restore document order so the filtered text still reads coherently.
    kept = [chunks[i] for i in sorted(keep_idx)]
    filtered_text = "\n\n".join(kept)
    return filtered_text, scored
183
+
suur_data/ingest.py ADDED
@@ -0,0 +1,165 @@
1
+ """
2
+ suur_data.ingest
3
+ ---------------
4
+ Stage 1 — Data ingestion from URLs and local files.
5
+ Supported: .txt, .pdf, .docx, .csv, .json, .html, .epub, plain URLs
6
+ """
7
+
8
+ import os
9
+ import re
10
+ import csv
11
+ import json
12
+ import tempfile
13
+ import urllib.request
14
+ import urllib.parse
15
+ from pathlib import Path
16
+
17
+
18
+ def _is_url(source: str) -> bool:
19
+ parsed = urllib.parse.urlparse(source)
20
+ return parsed.scheme in ("http", "https", "ftp")
21
+
22
+
23
+ def _download(url: str, dest_dir: str) -> str:
24
+ """Download a URL to a temp file. Returns local path."""
25
+ parsed = urllib.parse.urlparse(url)
26
+ ext = Path(parsed.path).suffix or ".html"
27
+ fd, path = tempfile.mkstemp(suffix=ext, dir=dest_dir)
28
+ os.close(fd)
29
+ print(f" Downloading {url} ...")
30
+ headers = {"User-Agent": "Mozilla/5.0 (suur_data/1.0)"}
31
+ req = urllib.request.Request(url, headers=headers)
32
+ with urllib.request.urlopen(req, timeout=30) as resp, open(path, "wb") as f:
33
+ f.write(resp.read())
34
+ print(f" Saved to {path}")
35
+ return path
36
+
37
+
38
+ def _read_txt(path: str) -> str:
39
+ try:
40
+ import chardet
41
+ raw = open(path, "rb").read()
42
+ enc = chardet.detect(raw)["encoding"] or "utf-8"
43
+ except ImportError:
44
+ enc = "utf-8"
45
+ return open(path, encoding=enc, errors="replace").read()
46
+
47
+
48
+ def _read_pdf(path: str) -> str:
49
+ try:
50
+ from pdfminer.high_level import extract_text
51
+ return extract_text(path)
52
+ except ImportError:
53
+ raise RuntimeError("pdfminer.six is required for PDF support: pip install pdfminer.six")
54
+
55
+
56
+ def _read_docx(path: str) -> str:
57
+ try:
58
+ import docx
59
+ doc = docx.Document(path)
60
+ return "\n".join(p.text for p in doc.paragraphs)
61
+ except ImportError:
62
+ raise RuntimeError("python-docx is required for .docx support: pip install python-docx")
63
+
64
+
65
+ def _read_csv(path: str) -> str:
66
+ rows = []
67
+ with open(path, newline="", encoding="utf-8", errors="replace") as f:
68
+ reader = csv.reader(f)
69
+ for row in reader:
70
+ rows.append(" ".join(row))
71
+ return "\n".join(rows)
72
+
73
+
74
+ def _read_json(path: str) -> str:
75
+ with open(path, encoding="utf-8", errors="replace") as f:
76
+ obj = json.load(f)
77
+
78
+ def _flatten(o, depth=0):
79
+ if isinstance(o, str):
80
+ return o
81
+ if isinstance(o, (int, float, bool)):
82
+ return str(o)
83
+ if isinstance(o, list):
84
+ return "\n".join(_flatten(i, depth) for i in o)
85
+ if isinstance(o, dict):
86
+ parts = []
87
+ for k, v in o.items():
88
+ parts.append(f"{k}: {_flatten(v, depth+1)}")
89
+ return "\n".join(parts)
90
+ return ""
91
+
92
+ return _flatten(obj)
93
+
94
+
95
+ def _read_html(path: str) -> str:
96
+ try:
97
+ from bs4 import BeautifulSoup
98
+ except ImportError:
99
+ raise RuntimeError("beautifulsoup4 is required for HTML support: pip install beautifulsoup4")
100
+ with open(path, encoding="utf-8", errors="replace") as f:
101
+ soup = BeautifulSoup(f.read(), "html.parser")
102
+ for tag in soup(["script", "style", "nav", "footer", "header"]):
103
+ tag.decompose()
104
+ return soup.get_text(separator="\n")
105
+
106
+
107
+ def _read_epub(path: str) -> str:
108
+ try:
109
+ import ebooklib
110
+ from ebooklib import epub
111
+ from bs4 import BeautifulSoup
112
+ except ImportError:
113
+ raise RuntimeError("ebooklib + beautifulsoup4 required for .epub: pip install ebooklib beautifulsoup4")
114
+ book = epub.read_epub(path)
115
+ chapters = []
116
+ for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
117
+ soup = BeautifulSoup(item.get_content(), "html.parser")
118
+ chapters.append(soup.get_text(separator="\n"))
119
+ return "\n\n".join(chapters)
120
+
121
+
122
# Dispatch table: lower-case file extension (with leading dot) -> reader
# function taking a path and returning the file's text. Extensions that are
# not listed here are read as plain text by ingest().
READERS = {
    ".txt": _read_txt,
    ".md": _read_txt,
    ".rst": _read_txt,
    ".pdf": _read_pdf,
    ".docx": _read_docx,
    ".csv": _read_csv,
    ".tsv": _read_csv,
    ".json": _read_json,
    ".html": _read_html,
    ".htm": _read_html,
    ".epub": _read_epub,
}
135
+
136
+
137
def ingest(source: str, tmp_dir: str | None = None) -> str:
    """
    Ingest text from a URL or local file.
    Returns a single raw text string.

    Parameters:
        source  — http/https/ftp URL, or a local path ("~" is expanded)
        tmp_dir — directory for downloaded files. When omitted and a
                  download is needed, a temporary directory is created and
                  removed again before returning. A directory passed by the
                  caller is never deleted here.

    Raises FileNotFoundError when a local path does not exist.
    """
    import shutil  # local import: only used to clean up a self-made temp dir

    owned_tmp = None  # set only when this call created the directory itself
    try:
        if _is_url(source):
            if tmp_dir is None:
                # A scratch directory is only needed for downloads.
                owned_tmp = tmp_dir = tempfile.mkdtemp(prefix="suur_data_")
            local_path = _download(source, tmp_dir)
        else:
            local_path = os.path.expanduser(source)
            if not os.path.exists(local_path):
                raise FileNotFoundError(f"File not found: {local_path}")

        ext = Path(local_path).suffix.lower()

        if ext not in READERS:
            # fallback: try plain text
            print(f" Unknown extension '{ext}', attempting plain-text read.")
            reader = _read_txt
        else:
            reader = READERS[ext]

        print(f" Reading {ext or 'file'} ...")
        text = reader(local_path)
    finally:
        if owned_tmp is not None:
            shutil.rmtree(owned_tmp, ignore_errors=True)

    print(f" Ingested {len(text):,} characters.")
    return text
165
+
suur_data/tokenizer.py ADDED
@@ -0,0 +1,142 @@
1
+
2
+ import re
3
+ import os
4
+ import json
5
+ import collections
6
+ from typing import List, Optional
7
+
8
+
9
+ def _simple_pretokenize(text: str) -> List[str]:
10
+ return re.findall(r"\b\w+\b|[^\w\s]", text)
11
+
12
+
13
+ def _load_hf_tokenizer(model_name: str):
14
+ try:
15
+ from transformers import AutoTokenizer
16
+ except ImportError:
17
+ raise ImportError("Install transformers: pip install transformers")
18
+ models = {
19
+ "gpt2": "gpt2",
20
+ "bert": "bert-base-uncased",
21
+ "roberta": "roberta-base",
22
+ "distilbert": "distilbert-base-uncased",
23
+ "t5": "t5-small",
24
+ }
25
+ resolved = models.get(model_name.lower(), model_name)
26
+ print(f" Loading pretrained tokenizer: {resolved} ...")
27
+ return AutoTokenizer.from_pretrained(resolved)
28
+
29
+
30
def _fallback_word_tokenize(text: str) -> List[int]:
    """
    Map each pre-token to an integer ID assigned in order of first
    appearance (first distinct token is 0, the next is 1, and so on).
    """
    index: dict = {}
    # setdefault assigns the next free ID only the first time a token is seen.
    ids: List[int] = [
        index.setdefault(token, len(index))
        for token in _simple_pretokenize(text)
    ]
    print(f" Fallback tokenizer: vocab_size={len(index)}, tokens={len(ids):,}")
    return ids
40
+
41
+
42
def tokenize_pretrained(text: str, model_name: str = "gpt2", save_dir: Optional[str] = None) -> List[int]:
    """
    Encode `text` with a pretrained HuggingFace tokenizer.

    No special tokens are added. When `save_dir` is given, the directory is
    created if needed and the tokenizer is saved into it. If an ImportError
    is raised at any point, the text is tokenized with the word-level
    fallback instead.
    """
    try:
        hf_tokenizer = _load_hf_tokenizer(model_name)
        token_ids = hf_tokenizer.encode(text, add_special_tokens=False)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
            hf_tokenizer.save_pretrained(save_dir)
        print(f" Pretrained tokenizer produced {len(token_ids):,} tokens.")
        return token_ids
    except ImportError:
        print(" [WARN] transformers not available, using fallback word tokenizer.")
        return _fallback_word_tokenize(text)
54
+
55
+
56
def _get_vocab(text: str) -> dict:
    """
    Count pre-tokens in BPE training form.

    Each key is a token spelled as space-separated characters followed by
    the end-of-word marker, e.g. "l o w </w>"; each value is how often that
    token occurs in `text`.
    """
    counts = collections.Counter(
        " ".join(word) + " </w>" for word in _simple_pretokenize(text)
    )
    return dict(counts)
62
+
63
+
64
+ def _get_pairs(vocab: dict) -> dict:
65
+ pairs: dict = collections.Counter()
66
+ for word, freq in vocab.items():
67
+ symbols = word.split()
68
+ for i in range(len(symbols) - 1):
69
+ pairs[(symbols[i], symbols[i + 1])] += freq
70
+ return pairs
71
+
72
+
73
+ def _merge_vocab(pair, vocab: dict) -> dict:
74
+ new_vocab: dict = {}
75
+ bigram = re.escape(" ".join(pair))
76
+ pattern = re.compile(r"(?<!\S)" + bigram + r"(?!\S)")
77
+ for word in vocab:
78
+ new_vocab[pattern.sub("".join(pair), word)] = vocab[word]
79
+ return new_vocab
80
+
81
+
82
def _pure_python_bpe(text: str, vocab_size: int = 1000, save_dir: Optional[str] = None) -> List[int]:
    """
    Train a small byte-pair-encoding vocabulary on `text` and encode `text`
    with it.

    Training repeatedly merges the most frequent adjacent symbol pair until
    the symbol set reaches `vocab_size` or no pairs remain. Symbols are then
    numbered in sorted order, and every pre-token is encoded by replaying
    the merges in the order they were learned.

    Words are encoded with the same trailing "</w>" marker that training
    uses, so merges involving the end-of-word marker actually apply.

    When `save_dir` is given, the symbol-to-ID table is written to
    "bpe_vocab.json" in that directory. Returns the flat list of token IDs.
    """
    print(f" Training pure-Python BPE (vocab_size={vocab_size}) ...")
    vocab = _get_vocab(text)
    symbols: set = set()
    for word in vocab:
        symbols.update(word.split())

    merges = []
    while len(symbols) < vocab_size:
        pairs = _get_pairs(vocab)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab = _merge_vocab(best, vocab)
        symbols.add("".join(best))
        merges.append(best)

    word_to_id = {s: i for i, s in enumerate(sorted(symbols))}

    def _segment(word: str) -> List[str]:
        # Start from the training representation: characters plus "</w>".
        pieces = list(word) + ["</w>"]
        for a, b in merges:
            i = 0
            fused = []
            while i < len(pieces):
                if i < len(pieces) - 1 and pieces[i] == a and pieces[i + 1] == b:
                    fused.append(a + b)
                    i += 2
                else:
                    fused.append(pieces[i])
                    i += 1
            pieces = fused
        return pieces

    ids: List[int] = []
    segmented: dict = {}  # word -> pieces; each distinct word is segmented once
    for word in _simple_pretokenize(text):
        pieces = segmented.get(word)
        if pieces is None:
            pieces = segmented[word] = _segment(word)
        for piece in pieces:
            # Every piece is a training symbol; 0 is kept as a defensive default.
            ids.append(word_to_id.get(piece, 0))

    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
        with open(os.path.join(save_dir, "bpe_vocab.json"), "w", encoding="utf-8") as f:
            json.dump(word_to_id, f, indent=2)
    print(f" Pure-Python BPE: vocab={len(symbols)}, tokens={len(ids):,}")
    return ids
120
+
121
+
122
def tokenize_custom(text: str, vocab_size: int = 8000, save_dir: Optional[str] = None) -> List[int]:
    """
    Train a BPE tokenizer on `text` and return the token IDs of `text`.

    Uses the `tokenizers` library (whitespace pre-tokenization, "[UNK]" as
    the unknown token, pairs seen fewer than twice are not merged). When
    `save_dir` is given, the trained tokenizer is written to
    "tokenizer.json" there. If an ImportError is raised, the pure-Python
    BPE implementation is used instead.
    """
    try:
        from tokenizers import Tokenizer
        from tokenizers.models import BPE
        from tokenizers.trainers import BpeTrainer
        from tokenizers.pre_tokenizers import Whitespace

        print(f" Training HF BPE tokenizer (vocab_size={vocab_size}) ...")
        bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        bpe_tokenizer.pre_tokenizer = Whitespace()
        bpe_trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]"], min_frequency=2)
        bpe_tokenizer.train_from_iterator([text], trainer=bpe_trainer)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
            target = os.path.join(save_dir, "tokenizer.json")
            bpe_tokenizer.save(target)
        token_ids = bpe_tokenizer.encode(text).ids
        print(f" HF BPE: vocab={bpe_tokenizer.get_vocab_size()}, tokens={len(token_ids):,}")
        return token_ids
    except ImportError:
        print(" [WARN] tokenizers library not found, using pure-Python BPE.")
        return _pure_python_bpe(text, vocab_size=vocab_size, save_dir=save_dir)
@@ -0,0 +1,209 @@
1
+ Metadata-Version: 2.4
2
+ Name: suur_data
3
+ Version: 1.0.0
4
+ Summary: Intelligent data ingestion and tokenization pipeline
5
+ Author: Your Name
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: Topic :: Text Processing :: Linguistic
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: requests
11
+ Requires-Dist: beautifulsoup4
12
+ Requires-Dist: scikit-learn
13
+ Requires-Dist: numpy
14
+ Requires-Dist: click
15
+ Requires-Dist: chardet
16
+ Provides-Extra: pdf
17
+ Requires-Dist: pdfminer.six; extra == "pdf"
18
+ Provides-Extra: docx
19
+ Requires-Dist: python-docx; extra == "docx"
20
+ Provides-Extra: epub
21
+ Requires-Dist: ebooklib; extra == "epub"
22
+ Provides-Extra: hf
23
+ Requires-Dist: transformers; extra == "hf"
24
+ Requires-Dist: tokenizers; extra == "hf"
25
+ Provides-Extra: all
26
+ Requires-Dist: pdfminer.six; extra == "all"
27
+ Requires-Dist: python-docx; extra == "all"
28
+ Requires-Dist: ebooklib; extra == "all"
29
+ Requires-Dist: transformers; extra == "all"
30
+ Requires-Dist: tokenizers; extra == "all"
31
+ Dynamic: author
32
+ Dynamic: classifier
33
+ Dynamic: description
34
+ Dynamic: description-content-type
35
+ Dynamic: provides-extra
36
+ Dynamic: requires-dist
37
+ Dynamic: requires-python
38
+ Dynamic: summary
39
+
40
+ # Suur Data
41
+
42
+ **Intelligent data ingestion and tokenization pipeline.**
43
+
44
+ Suur Data fetches text from any source, filters it by topic using a neural relevance scorer, then tokenizes it using either a pretrained HuggingFace tokenizer or a custom-trained BPE tokenizer.
45
+
46
+ ---
47
+
48
+ ## Installation
49
+
50
+ ```bash
51
+ # Core (URLs, .txt, .csv, .json, .html)
52
+ pip install -e .
53
+
54
+ # With all optional formats + HuggingFace tokenizers
55
+ pip install -e ".[all]"
56
+ ```
57
+
58
+ ---
59
+
60
+ ## Python API
61
+
62
+ ```python
63
+ from suur_data import suur_data
64
+
65
+ # Minimal — fetches URL, no filter, GPT-2 tokenizer
66
+ tokens = suur_data("https://en.wikipedia.org/wiki/Neuroscience")
67
+
68
+ # Filter by topic, custom BPE tokenizer
69
+ tokens = suur_data(
70
+ "research_paper.pdf",
71
+ topic="quantum computing",
72
+ tokenizer="custom",
73
+ vocab_size=4000,
74
+ save_dir="./my_tokenizer",
75
+ )
76
+
77
+ # Local file, pretrained BERT tokenizer, strict filter
78
+ tokens = suur_data(
79
+ "~/corpus/biology.txt",
80
+ topic="cell biology",
81
+ tokenizer="pretrained",
82
+ model="bert",
83
+ threshold=0.10,
84
+ )
85
+
86
+ # Skip the filter entirely
87
+ tokens = suur_data("data.csv", no_filter=True)
88
+
89
+ print(tokens[:20]) # list of integer token IDs
90
+ print(len(tokens)) # total token count
91
+ ```
92
+
93
+ ### Parameters
94
+
95
+ | Parameter | Type | Default | Description |
96
+ |-----------|------|---------|-------------|
97
+ | `data_location` | str | — | URL or local file path |
98
+ | `topic` | str | `""` | Subject for relevance filtering (empty = skip filter) |
99
+ | `tokenizer` | str | `"pretrained"` | `"pretrained"` or `"custom"` |
100
+ | `model` | str | `"gpt2"` | HuggingFace model shortcut or full ID |
101
+ | `vocab_size` | int | `8000` | BPE vocab size for custom tokenizer |
102
+ | `threshold` | float | `0.05` | Cosine similarity cutoff (0.0–1.0) |
103
+ | `save_dir` | str | `None` | Path to save tokenizer files |
104
+ | `no_filter` | bool | `False` | Skip the relevance filter |
105
+ | `verbose` | bool | `True` | Show progress output |
106
+
107
+ ### Returns
108
+ `List[int]` — flat list of integer token IDs.
109
+
110
+ ---
111
+
112
+ ## CLI
113
+
114
+ ```bash
115
+ # Basic URL fetch
116
+ suur-data fetch https://example.com/article --topic "machine learning"
117
+
118
+ # PDF with custom BPE tokenizer
119
+ suur-data fetch paper.pdf --topic "protein folding" --tokenizer custom --vocab-size 6000
120
+
121
+ # Local file, pretrained BERT, save tokenizer
122
+ suur-data fetch corpus.txt --tokenizer pretrained --model bert --save-dir ./bert_tok
123
+
124
+ # Skip filter, save tokens to file
125
+ suur-data fetch data.json --no-filter --output tokens.json
126
+
127
+ # See supported models
128
+ suur-data models
129
+
130
+ # See supported file formats
131
+ suur-data formats
132
+ ```
133
+
134
+ ---
135
+
136
+ ## Supported Input Formats
137
+
138
+ | Format | Notes |
139
+ |--------|-------|
140
+ | `.txt`, `.md`, `.rst` | Plain text |
141
+ | `.pdf` | Requires `pdfminer.six` |
142
+ | `.docx` | Requires `python-docx` |
143
+ | `.csv`, `.tsv` | All cells joined as text |
144
+ | `.json` | Recursively flattened key-value pairs |
145
+ | `.html`, `.htm` | Scripts/styles stripped (requires `beautifulsoup4`) |
146
+ | `.epub` | E-books (requires `ebooklib` + `beautifulsoup4`) |
147
+ | HTTP/HTTPS URL | Auto-downloaded, then parsed by extension |
148
+
149
+ ---
150
+
151
+ ## Pretrained Model Shortcuts
152
+
153
+ | Shortcut | Model |
154
+ |----------|-------|
155
+ | `gpt2` | GPT-2 (OpenAI) |
156
+ | `bert` | BERT base uncased |
157
+ | `roberta` | RoBERTa base |
158
+ | `distilbert` | DistilBERT base uncased |
159
+ | `t5` | T5 small |
160
+
161
+ You can also pass any HuggingFace Hub model ID directly:
162
+ ```
163
+ --model "facebook/opt-125m"
164
+ ```
165
+
166
+ ---
167
+
168
+ ## Architecture
169
+
170
+ ```
171
+ Source (URL / file)
172
+
173
+
174
+ Stage 1: Ingest
175
+ Handles 8 file types + HTTP download
176
+
177
+
178
+ Stage 2: Neural Filter
179
+ Splits into paragraph chunks
180
+ Scores each chunk against topic via TF-IDF cosine similarity
181
+ Drops chunks below threshold
182
+
183
+
184
+ Stage 3: Tokenize
185
+ ┌─────────────────────┐ ┌────────────────────────────┐
186
+ │ Pretrained mode │ │ Custom mode │
187
+ │ HuggingFace │ │ BPE trainer (HF library │
188
+ │ AutoTokenizer │ │ or pure-Python fallback) │
189
+ └─────────────────────┘ └────────────────────────────┘
190
+
191
+
192
+ List[int] ← token IDs
193
+ ```
194
+
195
+ ---
196
+
197
+ ## Dependency Matrix
198
+
199
+ | Feature | Required packages |
200
+ |---------|------------------|
201
+ | Core pipeline | `requests`, `beautifulsoup4`, `scikit-learn`, `numpy`, `click`, `chardet` |
202
+ | PDF support | `pdfminer.six` |
203
+ | .docx support | `python-docx` |
204
+ | .epub support | `ebooklib` |
205
+ | Pretrained tokenizers | `transformers` |
206
+ | Fast BPE training | `tokenizers` |
207
+
208
+ All optional — the tool degrades gracefully with built-in fallbacks when optional packages are missing.
209
+
@@ -0,0 +1,9 @@
1
+ suur_data/__init__.py,sha256=NN4R90oZcJIzHtwn8L3sksZ0bFAVQTbvQF-6--hwIvU,8064
2
+ suur_data/filter.py,sha256=HCp_j35kC2UeKV_QthbW5SV5NUwS2F9UKF72lxIWljM,5804
3
+ suur_data/ingest.py,sha256=c2d5d4wVawxdd2FG_Ecsxb5bRLvGz-PiFAHcIoPvxZo,4811
4
+ suur_data/tokenizer.py,sha256=uDnGTKBigXdMQkPcJm-ebC62Sim7fHQnsTOavy6Ir5k,4989
5
+ suur_data-1.0.0.dist-info/METADATA,sha256=rmBT0QEdEvx6A_EdMrQYM7e8LIl-Tz6VL83tFFFC9A0,5856
6
+ suur_data-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
7
+ suur_data-1.0.0.dist-info/entry_points.txt,sha256=TKKBMSX0wa1BroERW5ftHglvR2-Fy_8fxij3XmqnO7o,45
8
+ suur_data-1.0.0.dist-info/top_level.txt,sha256=8ooHxzPfqZ56sFPb_U6tRTKdxXBlsBrZOhDHWO9g_x4,10
9
+ suur_data-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ suur-data = suur_data:main
@@ -0,0 +1 @@
1
+ suur_data