suur-data 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- suur_data/__init__.py +233 -0
- suur_data/filter.py +183 -0
- suur_data/ingest.py +165 -0
- suur_data/tokenizer.py +142 -0
- suur_data-1.0.0.dist-info/METADATA +209 -0
- suur_data-1.0.0.dist-info/RECORD +9 -0
- suur_data-1.0.0.dist-info/WHEEL +5 -0
- suur_data-1.0.0.dist-info/entry_points.txt +2 -0
- suur_data-1.0.0.dist-info/top_level.txt +1 -0
suur_data/__init__.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
"""
|
|
2
|
+
suur_data
|
|
3
|
+
========
|
|
4
|
+
A smart data ingestion and tokenization pipeline.
|
|
5
|
+
|
|
6
|
+
Python API usage:
|
|
7
|
+
from suur_data import suur_data
|
|
8
|
+
|
|
9
|
+
tokens = suur_data("https://example.com/article.html")
|
|
10
|
+
tokens = suur_data("my_corpus.pdf", topic="quantum computing")
|
|
11
|
+
tokens = suur_data("data.txt", tokenizer="custom", vocab_size=4000)
|
|
12
|
+
|
|
13
|
+
CLI usage:
|
|
14
|
+
suur_data fetch <source> [OPTIONS]
|
|
15
|
+
|
|
16
|
+
Options:
|
|
17
|
+
--topic TEXT Topic/subject to filter content by
|
|
18
|
+
--tokenizer [pretrained|custom]
|
|
19
|
+
--model TEXT Pretrained model name (default: gpt2)
|
|
20
|
+
--vocab-size INT BPE vocab size for custom mode (default: 8000)
|
|
21
|
+
--threshold FLOAT Relevance threshold 0.0–1.0 (default: 0.05)
|
|
22
|
+
--save-dir PATH Where to save tokenizer artifacts
|
|
23
|
+
--no-filter Skip the relevance filter
|
|
24
|
+
--output PATH Save token IDs to a JSON file
--quiet Suppress progress output
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import os
|
|
30
|
+
import sys
|
|
31
|
+
import json
|
|
32
|
+
import tempfile
|
|
33
|
+
from typing import List, Optional
|
|
34
|
+
|
|
35
|
+
import click
|
|
36
|
+
|
|
37
|
+
from .ingest import ingest
|
|
38
|
+
from .filter import filter_chunks
|
|
39
|
+
from .tokenizer import tokenize_pretrained, tokenize_custom
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
# Python API
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
|
|
46
|
+
def suur_data(
    data_location: str,
    topic: str = "",
    tokenizer: str = "pretrained",
    model: str = "gpt2",
    vocab_size: int = 8000,
    threshold: float = 0.05,
    save_dir: Optional[str] = None,
    no_filter: bool = False,
    verbose: bool = True,
) -> List[int]:
    """
    Run the end-to-end pipeline: ingest → filter → tokenize.

    Parameters
    ----------
    data_location : str
        URL or local file path. Supports .txt, .pdf, .docx, .csv,
        .json, .html, .htm, .epub, .md, .rst
    topic : str
        Subject/keyword for the relevance filter.
        Leave empty (or whitespace only) to skip filtering.
    tokenizer : str
        "custom" trains a BPE tokenizer on the text; any other value
        (default "pretrained") uses the pretrained tokenizer.
    model : str
        HuggingFace model name for pretrained mode (default: "gpt2").
    vocab_size : int
        BPE vocabulary size for custom mode (default: 8000).
    threshold : float
        Cosine similarity cutoff for relevance filter (default: 0.05).
    save_dir : str | None
        Directory to save tokenizer files. None = don't save.
    no_filter : bool
        If True, skip the relevance filter and tokenize everything.
    verbose : bool
        Print progress information.

    Returns
    -------
    List[int]
        List of integer token IDs.
    """
    # --- Stage 1: Ingest ---
    if verbose:
        print(f"\n[suur_data] Stage 1 — Ingesting: {data_location}")
    # The scratch directory is only handed to ingest(); the text it returns
    # lives in memory, so the directory is removed as soon as ingestion ends
    # instead of being left behind on every call.
    with tempfile.TemporaryDirectory(prefix="suur_data_") as tmp:
        raw_text = ingest(data_location, tmp_dir=tmp)

    # --- Stage 2: Filter ---
    if no_filter or not topic.strip():
        if verbose:
            if no_filter:
                print("[suur_data] Stage 2 — Filter: SKIPPED (--no-filter)")
            else:
                print("[suur_data] Stage 2 — Filter: SKIPPED (no topic given)")
        filtered_text = raw_text
    else:
        if verbose:
            print(f"[suur_data] Stage 2 — Neural Filter (topic: '{topic}')")
        # The per-chunk scores returned alongside the text are not needed here.
        filtered_text, _ = filter_chunks(
            raw_text,
            topic=topic,
            threshold=threshold,
            verbose=verbose,
        )

    # --- Stage 3: Tokenize ---
    if verbose:
        print(f"[suur_data] Stage 3 — Tokenizing ({tokenizer} mode)")

    if tokenizer == "custom":
        tokens = tokenize_custom(filtered_text, vocab_size=vocab_size, save_dir=save_dir)
    else:
        tokens = tokenize_pretrained(filtered_text, model_name=model, save_dir=save_dir)

    if verbose:
        print(f"\n[suur_data] Done. Total tokens: {len(tokens):,}\n")

    return tokens
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# ---------------------------------------------------------------------------
|
|
129
|
+
# CLI
|
|
130
|
+
# ---------------------------------------------------------------------------
|
|
131
|
+
|
|
132
|
+
# Root click command group. The sub-commands "fetch", "models" and "formats"
# are attached to it further down via ``@cli.command(...)``. The function has
# no body of its own; its docstring is the text of the group.
@click.group()
@click.version_option("1.0.0", prog_name="suur-data")
def cli():
    """
    \b
    Suur Data — intelligent data ingestion and tokenization pipeline.

    Fetch any text source, filter it by topic using TF-IDF relevance
    scoring, then tokenize with a pretrained or custom BPE tokenizer.
    """
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
@cli.command("fetch")
@click.argument("source")
@click.option(
    "--topic",
    default="",
    show_default=True,
    help="Topic for relevance filtering.",
)
@click.option(
    "--tokenizer",
    default="pretrained",
    show_default=True,
    type=click.Choice(["pretrained", "custom"]),
    help="Tokenizer mode.",
)
@click.option(
    "--model",
    default="gpt2",
    show_default=True,
    help="Pretrained model name (HuggingFace).",
)
@click.option(
    "--vocab-size",
    default=8000,
    show_default=True,
    help="BPE vocab size for custom tokenizer.",
)
@click.option(
    "--threshold",
    default=0.05,
    show_default=True,
    help="Relevance filter threshold (0.0–1.0).",
)
@click.option("--save-dir", default=None, help="Directory to save tokenizer artifacts.")
@click.option("--no-filter", is_flag=True, default=False, help="Skip the relevance filter.")
@click.option("--output", default=None, help="Save token IDs to a JSON file.")
@click.option("--quiet", is_flag=True, default=False, help="Suppress progress output.")
def fetch_cmd(source, topic, tokenizer, model, vocab_size, threshold,
              save_dir, no_filter, output, quiet):
    """
    Ingest SOURCE (URL or file path), filter by topic, and tokenize.

    \b
    Examples:
    suur_data fetch https://en.wikipedia.org/wiki/Neuroscience --topic "brain"
    suur_data fetch corpus.pdf --topic "machine learning" --tokenizer custom
    suur_data fetch data.txt --no-filter --tokenizer pretrained --model bert
    """
    # Forward the CLI options to the Python API; --quiet is the inverse of
    # the API's ``verbose`` flag.
    pipeline_kwargs = {
        "data_location": source,
        "topic": topic,
        "tokenizer": tokenizer,
        "model": model,
        "vocab_size": vocab_size,
        "threshold": threshold,
        "save_dir": save_dir,
        "no_filter": no_filter,
        "verbose": not quiet,
    }
    tokens = suur_data(**pipeline_kwargs)

    # Without --output, print a short summary instead of writing a file.
    if not output:
        click.echo(f"Token count: {len(tokens):,}")
        click.echo(f"First 50 tokens: {tokens[:50]}")
        return

    with open(output, "w") as f:
        json.dump(tokens, f)
    click.echo(f"Tokens saved to {output}")
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
@cli.command("models")
def models_cmd():
    """List supported pretrained model shortcuts."""
    # Shortcut -> human-readable label, printed in this order.
    shortcuts = {
        "gpt2": "GPT-2 (OpenAI)",
        "bert": "BERT base uncased",
        "roberta": "RoBERTa base",
        "distilbert": "DistilBERT base uncased",
        "t5": "T5 small",
    }
    click.echo("\nSupported pretrained model shortcuts:\n")
    for shortcut, label in shortcuts.items():
        click.echo(f" {shortcut:<14} {label}")
    click.echo("\nYou can also pass any HuggingFace model ID directly.")
    click.echo('Example: --model "facebook/opt-125m"\n')
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
@cli.command("formats")
def formats_cmd():
    """List supported input file formats."""
    # Extension(s) -> description, printed in this order.
    supported = {
        ".txt / .md / .rst": "Plain text, Markdown, reStructuredText",
        ".pdf": "PDF documents (requires pdfminer.six)",
        ".docx": "Word documents (requires python-docx)",
        ".csv / .tsv": "Comma/tab-separated values",
        ".json": "JSON — recursively flattens key-value pairs",
        ".html / .htm": "HTML pages (requires beautifulsoup4)",
        ".epub": "E-books (requires ebooklib + beautifulsoup4)",
        "URL": "Any HTTP/HTTPS URL — auto-downloaded",
    }
    click.echo("\nSupported input formats:\n")
    for extensions, description in supported.items():
        click.echo(f" {extensions:<22} {description}")
    click.echo()
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
# ---------------------------------------------------------------------------
|
|
224
|
+
# Script entry point. NOTE: `python -m suur_data` runs a package's __main__.py, not this __init__.py guard.
|
|
225
|
+
# ---------------------------------------------------------------------------
|
|
226
|
+
|
|
227
|
+
def main():
    """Invoke the ``cli`` click command group."""
    cli()


# Run the CLI when this file is executed directly as a script.
if __name__ == "__main__":
    main()
|
|
233
|
+
|
suur_data/filter.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""
|
|
2
|
+
suur_data.filter
|
|
3
|
+
---------------
|
|
4
|
+
Stage 2 — Neural relevance filter.
|
|
5
|
+
|
|
6
|
+
Architecture:
|
|
7
|
+
- Splits raw text into paragraph-level chunks
|
|
8
|
+
- Embeds chunks using TF-IDF (fast, no GPU needed)
|
|
9
|
+
- Scores each chunk against a topic query using cosine similarity
|
|
10
|
+
- Keeps only chunks above a configurable threshold
|
|
11
|
+
- Falls back to a built-in pure-Python TF-IDF scorer if sklearn is missing
|
|
12
|
+
|
|
13
|
+
This is intentionally lightweight so it runs on CPU with no external
|
|
14
|
+
model downloads. For better quality, swap in a sentence-transformers
|
|
15
|
+
embedding in the future.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import re
|
|
19
|
+
import math
|
|
20
|
+
from typing import List, Tuple
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
# 1. Text splitter
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
def split_chunks(text: str, min_words: int = 20) -> List[str]:
    """
    Split text into paragraph-level chunks.

    Paragraphs are separated by blank lines. A paragraph with fewer than
    ``min_words`` whitespace-separated words is buffered and merged (joined
    with a single space) with the following paragraph(s) until the buffer
    reaches ``min_words``. Whatever remains in the buffer at the end is
    emitted as a final chunk, even if it is short.

    Parameters
    ----------
    text : str
        Raw text to split.
    min_words : int
        Minimum number of words a chunk must have before it is emitted.

    Returns
    -------
    List[str]
        Chunks in document order; empty for empty or whitespace-only text.
    """
    # A "blank line" may still contain spaces, tabs or a carriage return
    # (common in HTML-extracted or CRLF text), so match any whitespace
    # between the two newlines rather than only consecutive "\n".
    paragraphs = re.split(r"\n\s*\n", text)

    chunks: List[str] = []
    pending = ""
    for para in paragraphs:
        para = para.strip()
        if not para:
            continue
        pending = f"{pending} {para}" if pending else para
        if len(pending.split()) >= min_words:
            chunks.append(pending)
            pending = ""

    if pending:
        chunks.append(pending)

    return chunks
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
# 2. TF-IDF based relevance scorer
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
|
|
58
|
+
def _tokenize(text: str) -> List[str]:
|
|
59
|
+
return re.findall(r"[a-z]+", text.lower())
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _tf(tokens: List[str]) -> dict:
|
|
63
|
+
counts: dict = {}
|
|
64
|
+
for t in tokens:
|
|
65
|
+
counts[t] = counts.get(t, 0) + 1
|
|
66
|
+
total = len(tokens) or 1
|
|
67
|
+
return {t: c / total for t, c in counts.items()}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _idf(docs: List[List[str]], vocab: set) -> dict:
|
|
71
|
+
N = len(docs)
|
|
72
|
+
idf: dict = {}
|
|
73
|
+
for word in vocab:
|
|
74
|
+
df = sum(1 for doc in docs if word in doc)
|
|
75
|
+
idf[word] = math.log((N + 1) / (df + 1)) + 1
|
|
76
|
+
return idf
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _cosine(a: dict, b: dict) -> float:
|
|
80
|
+
dot = sum(a.get(k, 0) * v for k, v in b.items())
|
|
81
|
+
norm_a = math.sqrt(sum(v * v for v in a.values())) or 1e-9
|
|
82
|
+
norm_b = math.sqrt(sum(v * v for v in b.values())) or 1e-9
|
|
83
|
+
return dot / (norm_a * norm_b)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _tfidf_score(query_tokens: List[str],
                 chunks: List[str]) -> List[float]:
    """
    Score every chunk against the query with TF-IDF cosine similarity.

    IDF weights are computed over the query plus all chunks; the returned
    list has one score per chunk, in the same order as ``chunks``.
    """
    chunk_tokens = [_tokenize(chunk) for chunk in chunks]
    corpus = [query_tokens] + chunk_tokens
    vocabulary = set(term for doc in corpus for term in doc)

    weights = _idf(corpus, vocabulary)

    def _weighted(tokens):
        # TF of each term scaled by its corpus-wide IDF weight.
        frequencies = _tf(tokens)
        return {term: frequencies[term] * weights.get(term, 1) for term in frequencies}

    query_vec = _weighted(query_tokens)
    return [_cosine(query_vec, _weighted(doc)) for doc in chunk_tokens]
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# ---------------------------------------------------------------------------
|
|
105
|
+
# 3. Optional: sklearn TF-IDF (higher quality)
|
|
106
|
+
# ---------------------------------------------------------------------------
|
|
107
|
+
|
|
108
|
+
def _sklearn_score(query: str, chunks: List[str]) -> List[float]:
    """
    Score every chunk against ``query`` using scikit-learn TF-IDF.

    The query and the chunks are vectorized together (English stop words
    removed, unigrams and bigrams), then each chunk is compared with the
    query by cosine similarity.

    Returns
    -------
    List[float]
        One similarity per chunk, in the same order as ``chunks``.

    Raises
    ------
    ImportError
        If scikit-learn is not installed; the imports are local so that the
        caller can catch this and fall back to another scorer.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # Row 0 of the matrix is the query; the remaining rows are the chunks.
    docs = [query] + chunks
    vec = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
    matrix = vec.fit_transform(docs)
    q_vec = matrix[0]
    chunk_vecs = matrix[1:]
    sims = cosine_similarity(q_vec, chunk_vecs)[0]
    return sims.tolist()
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# ---------------------------------------------------------------------------
|
|
123
|
+
# 4. Main filter function
|
|
124
|
+
# ---------------------------------------------------------------------------
|
|
125
|
+
|
|
126
|
+
def filter_chunks(
    text: str,
    topic: str,
    threshold: float = 0.05,
    min_words: int = 20,
    verbose: bool = True,
) -> Tuple[str, List[Tuple[str, float]]]:
    """
    Filter raw text to keep only chunks relevant to `topic`.

    Parameters
    ----------
    text : str
        Raw text to split into chunks and filter.
    topic : str
        Query the chunks are scored against. Empty/whitespace keeps all chunks.
    threshold : float
        Minimum score a chunk needs to be kept.
    min_words : int
        Passed to ``split_chunks``.
    verbose : bool
        Print progress information.

    Returns:
        filtered_text — kept chunks joined by blank lines, in their original
                        document order
        scored        — list of (chunk, score) for all chunks (sorted desc)

    If ``text`` yields no chunks, ``(text, [])`` is returned unchanged.
    """
    chunks = split_chunks(text, min_words=min_words)
    if not chunks:
        return text, []

    if verbose:
        print(f" Scoring {len(chunks)} chunks against topic: '{topic}' ...")

    # Use sklearn if available for better quality
    try:
        scores = _sklearn_score(topic, chunks)
        method = "sklearn TF-IDF"
    except ImportError:
        scores = _tfidf_score(_tokenize(topic), chunks)
        method = "built-in TF-IDF"

    if verbose:
        print(f" Scoring method: {method}")

    # Chunk indices from best to worst score; the sort is stable, so ties
    # keep document order.
    ranked = sorted(range(len(chunks)), key=lambda i: scores[i], reverse=True)
    scored = [(chunks[i], scores[i]) for i in ranked]

    # If topic is empty, keep everything
    if not topic.strip():
        kept = chunks
        if verbose:
            print(" No topic given — keeping all chunks.")
    else:
        # Select by score but iterate in document order, so the filtered
        # text reads in the same sequence as the source instead of being
        # reshuffled by relevance.
        kept = [c for c, s in zip(chunks, scores) if s >= threshold]
        dropped = len(chunks) - len(kept)
        if verbose:
            top = scored[:3]
            print(f" Kept {len(kept)}/{len(chunks)} chunks (dropped {dropped} below threshold {threshold})")
            print(f" Top scores: {[round(s,4) for _, s in top]}")

    # If filter is too aggressive, relax automatically
    if len(kept) == 0 and len(chunks) > 0:
        # Keep the top third (at least one chunk) regardless of threshold,
        # again restored to document order.
        n = max(1, len(chunks) // 3)
        kept = [chunks[i] for i in sorted(ranked[:n])]
        if verbose:
            print(f" Threshold too strict — auto-relaxed, keeping top {n} chunks.")

    filtered_text = "\n\n".join(kept)
    return filtered_text, scored
|
|
183
|
+
|
suur_data/ingest.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""
|
|
2
|
+
suur_data.ingest
|
|
3
|
+
---------------
|
|
4
|
+
Stage 1 — Data ingestion from URLs and local files.
|
|
5
|
+
Supported: .txt, .pdf, .docx, .csv, .json, .html, .epub, plain URLs
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import re
|
|
10
|
+
import csv
|
|
11
|
+
import json
|
|
12
|
+
import tempfile
|
|
13
|
+
import urllib.request
|
|
14
|
+
import urllib.parse
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _is_url(source: str) -> bool:
|
|
19
|
+
parsed = urllib.parse.urlparse(source)
|
|
20
|
+
return parsed.scheme in ("http", "https", "ftp")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _download(url: str, dest_dir: str) -> str:
|
|
24
|
+
"""Download a URL to a temp file. Returns local path."""
|
|
25
|
+
parsed = urllib.parse.urlparse(url)
|
|
26
|
+
ext = Path(parsed.path).suffix or ".html"
|
|
27
|
+
fd, path = tempfile.mkstemp(suffix=ext, dir=dest_dir)
|
|
28
|
+
os.close(fd)
|
|
29
|
+
print(f" Downloading {url} ...")
|
|
30
|
+
headers = {"User-Agent": "Mozilla/5.0 (suur_data/1.0)"}
|
|
31
|
+
req = urllib.request.Request(url, headers=headers)
|
|
32
|
+
with urllib.request.urlopen(req, timeout=30) as resp, open(path, "wb") as f:
|
|
33
|
+
f.write(resp.read())
|
|
34
|
+
print(f" Saved to {path}")
|
|
35
|
+
return path
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _read_txt(path: str) -> str:
|
|
39
|
+
try:
|
|
40
|
+
import chardet
|
|
41
|
+
raw = open(path, "rb").read()
|
|
42
|
+
enc = chardet.detect(raw)["encoding"] or "utf-8"
|
|
43
|
+
except ImportError:
|
|
44
|
+
enc = "utf-8"
|
|
45
|
+
return open(path, encoding=enc, errors="replace").read()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _read_pdf(path: str) -> str:
    """
    Extract the text of a PDF file with pdfminer.six.

    Raises
    ------
    RuntimeError
        If pdfminer.six is not installed.
    """
    # Only the import is guarded, so an ImportError raised while extracting
    # is not misreported as "pdfminer.six is missing".
    try:
        from pdfminer.high_level import extract_text
    except ImportError as err:
        raise RuntimeError("pdfminer.six is required for PDF support: pip install pdfminer.six") from err
    return extract_text(path)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _read_docx(path: str) -> str:
    """
    Return the paragraph text of a .docx file, one paragraph per line.

    Raises
    ------
    RuntimeError
        If python-docx is not installed.
    """
    # Only the import is guarded, so an ImportError raised while reading the
    # document is not misreported as "python-docx is missing".
    try:
        import docx
    except ImportError as err:
        raise RuntimeError("python-docx is required for .docx support: pip install python-docx") from err
    doc = docx.Document(path)
    return "\n".join(p.text for p in doc.paragraphs)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _read_csv(path: str) -> str:
|
|
66
|
+
rows = []
|
|
67
|
+
with open(path, newline="", encoding="utf-8", errors="replace") as f:
|
|
68
|
+
reader = csv.reader(f)
|
|
69
|
+
for row in reader:
|
|
70
|
+
rows.append(" ".join(row))
|
|
71
|
+
return "\n".join(rows)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _read_json(path: str) -> str:
|
|
75
|
+
with open(path, encoding="utf-8", errors="replace") as f:
|
|
76
|
+
obj = json.load(f)
|
|
77
|
+
|
|
78
|
+
def _flatten(o, depth=0):
|
|
79
|
+
if isinstance(o, str):
|
|
80
|
+
return o
|
|
81
|
+
if isinstance(o, (int, float, bool)):
|
|
82
|
+
return str(o)
|
|
83
|
+
if isinstance(o, list):
|
|
84
|
+
return "\n".join(_flatten(i, depth) for i in o)
|
|
85
|
+
if isinstance(o, dict):
|
|
86
|
+
parts = []
|
|
87
|
+
for k, v in o.items():
|
|
88
|
+
parts.append(f"{k}: {_flatten(v, depth+1)}")
|
|
89
|
+
return "\n".join(parts)
|
|
90
|
+
return ""
|
|
91
|
+
|
|
92
|
+
return _flatten(obj)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _read_html(path: str) -> str:
|
|
96
|
+
try:
|
|
97
|
+
from bs4 import BeautifulSoup
|
|
98
|
+
except ImportError:
|
|
99
|
+
raise RuntimeError("beautifulsoup4 is required for HTML support: pip install beautifulsoup4")
|
|
100
|
+
with open(path, encoding="utf-8", errors="replace") as f:
|
|
101
|
+
soup = BeautifulSoup(f.read(), "html.parser")
|
|
102
|
+
for tag in soup(["script", "style", "nav", "footer", "header"]):
|
|
103
|
+
tag.decompose()
|
|
104
|
+
return soup.get_text(separator="\n")
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _read_epub(path: str) -> str:
    """
    Return the text of every document item in an EPUB, joined by blank lines.

    Raises RuntimeError when ebooklib or beautifulsoup4 is not installed.
    """
    try:
        import ebooklib
        from ebooklib import epub
        from bs4 import BeautifulSoup
    except ImportError:
        raise RuntimeError("ebooklib + beautifulsoup4 required for .epub: pip install ebooklib beautifulsoup4")

    book = epub.read_epub(path)
    return "\n\n".join(
        BeautifulSoup(document.get_content(), "html.parser").get_text(separator="\n")
        for document in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    )
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# Dispatch table: lower-cased file extension -> reader function taking a path
# and returning text. ``ingest`` looks the extension up here and falls back to
# ``_read_txt`` for extensions that are not listed.
READERS = {
    # Plain text, Markdown and reStructuredText share the plain-text reader.
    ".txt": _read_txt,
    ".md": _read_txt,
    ".rst": _read_txt,
    ".pdf": _read_pdf,
    ".docx": _read_docx,
    # Comma- and tab-separated files share one reader.
    ".csv": _read_csv,
    ".tsv": _read_csv,
    ".json": _read_json,
    ".html": _read_html,
    ".htm": _read_html,
    ".epub": _read_epub,
}
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _read_local(local_path: str) -> str:
    """Pick a reader from READERS by file extension and return the file's text."""
    ext = Path(local_path).suffix.lower()

    reader = READERS.get(ext)
    if reader is None:
        # fallback: try plain text
        print(f" Unknown extension '{ext}', attempting plain-text read.")
        reader = _read_txt

    print(f" Reading {ext or 'file'} ...")
    text = reader(local_path)
    print(f" Ingested {len(text):,} characters.")
    return text


def ingest(source: str, tmp_dir: str | None = None) -> str:
    """
    Ingest text from a URL or local file.

    Parameters
    ----------
    source : str
        http/https/ftp URL, or a local path ("~" is expanded).
    tmp_dir : str | None
        Directory for downloaded files. When None and ``source`` is a URL,
        a private scratch directory is used and removed once the text has
        been read. No directory is created for local files.

    Returns
    -------
    str
        A single raw text string.

    Raises
    ------
    FileNotFoundError
        If ``source`` is a local path that does not exist.
    """
    if _is_url(source):
        if tmp_dir is None:
            # The download is only needed until it has been parsed, so the
            # scratch directory does not outlive this call.
            with tempfile.TemporaryDirectory(prefix="suur_data_") as scratch:
                return _read_local(_download(source, scratch))
        return _read_local(_download(source, tmp_dir))

    local_path = os.path.expanduser(source)
    if not os.path.exists(local_path):
        raise FileNotFoundError(f"File not found: {local_path}")
    return _read_local(local_path)
|
|
165
|
+
|
suur_data/tokenizer.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
|
|
2
|
+
import re
|
|
3
|
+
import os
|
|
4
|
+
import json
|
|
5
|
+
import collections
|
|
6
|
+
from typing import List, Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _simple_pretokenize(text: str) -> List[str]:
|
|
10
|
+
return re.findall(r"\b\w+\b|[^\w\s]", text)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _load_hf_tokenizer(model_name: str):
    """
    Load a pretrained tokenizer through ``transformers.AutoTokenizer``.

    ``model_name`` may be one of the short aliases below (matched
    case-insensitively) or any other identifier, which is passed through
    unchanged. Raises ImportError when transformers is not installed.
    """
    try:
        from transformers import AutoTokenizer
    except ImportError:
        raise ImportError("Install transformers: pip install transformers")

    # Short alias -> model identifier.
    aliases = {
        "gpt2": "gpt2",
        "bert": "bert-base-uncased",
        "roberta": "roberta-base",
        "distilbert": "distilbert-base-uncased",
        "t5": "t5-small",
    }
    alias = model_name.lower()
    resolved = aliases[alias] if alias in aliases else model_name
    print(f" Loading pretrained tokenizer: {resolved} ...")
    return AutoTokenizer.from_pretrained(resolved)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _fallback_word_tokenize(text: str) -> List[int]:
    """
    Map each pre-token of ``text`` to an integer ID.

    IDs are assigned in order of first appearance, starting at 0, so the
    vocabulary is specific to this one call.
    """
    index: dict = {}
    # setdefault evaluates len(index) before inserting, so a new word gets
    # the next free ID and a known word keeps its existing one.
    ids: List[int] = [
        index.setdefault(word, len(index)) for word in _simple_pretokenize(text)
    ]
    print(f" Fallback tokenizer: vocab_size={len(index)}, tokens={len(ids):,}")
    return ids
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def tokenize_pretrained(text: str, model_name: str = "gpt2", save_dir: Optional[str] = None) -> List[int]:
    """
    Encode ``text`` with a pretrained tokenizer and return the token IDs.

    Special tokens are not added. When ``save_dir`` is given the tokenizer
    files are saved there (the directory is created if needed). If an
    ImportError occurs, a warning is printed and the fallback word
    tokenizer is used instead.
    """
    try:
        hf_tokenizer = _load_hf_tokenizer(model_name)
        token_ids = hf_tokenizer.encode(text, add_special_tokens=False)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
            hf_tokenizer.save_pretrained(save_dir)
        print(f" Pretrained tokenizer produced {len(token_ids):,} tokens.")
    except ImportError:
        print(" [WARN] transformers not available, using fallback word tokenizer.")
        return _fallback_word_tokenize(text)
    return token_ids
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _get_vocab(text: str) -> dict:
    """
    Build the initial BPE vocabulary for ``text``.

    Each pre-token is written as its characters separated by spaces and
    terminated by the "</w>" end-of-word symbol; the value is how often
    that pre-token occurs.
    """
    spelled = (" ".join(word) + " </w>" for word in _simple_pretokenize(text))
    return dict(collections.Counter(spelled))
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _get_pairs(vocab: dict) -> dict:
|
|
65
|
+
pairs: dict = collections.Counter()
|
|
66
|
+
for word, freq in vocab.items():
|
|
67
|
+
symbols = word.split()
|
|
68
|
+
for i in range(len(symbols) - 1):
|
|
69
|
+
pairs[(symbols[i], symbols[i + 1])] += freq
|
|
70
|
+
return pairs
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _merge_vocab(pair, vocab: dict) -> dict:
|
|
74
|
+
new_vocab: dict = {}
|
|
75
|
+
bigram = re.escape(" ".join(pair))
|
|
76
|
+
pattern = re.compile(r"(?<!\S)" + bigram + r"(?!\S)")
|
|
77
|
+
for word in vocab:
|
|
78
|
+
new_vocab[pattern.sub("".join(pair), word)] = vocab[word]
|
|
79
|
+
return new_vocab
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _pure_python_bpe(text: str, vocab_size: int = 1000, save_dir: Optional[str] = None) -> List[int]:
    """
    Train a byte-pair-encoding vocabulary on ``text`` and encode ``text`` with it.

    Training repeatedly merges the most frequent adjacent symbol pair until
    the symbol set reaches ``vocab_size`` or no pairs remain. Encoding then
    replays the learned merges, in order, on every pre-token.

    Parameters
    ----------
    text : str
        Text used both for training and for encoding.
    vocab_size : int
        Target number of symbols.
    save_dir : str | None
        When given, the symbol -> ID table is written to
        ``bpe_vocab.json`` in this directory (created if needed).

    Returns
    -------
    List[int]
        Token IDs; an ID is the symbol's index in the sorted symbol set.
    """
    print(f" Training pure-Python BPE (vocab_size={vocab_size}) ...")
    vocab = _get_vocab(text)
    symbols: set = set()
    for word in vocab:
        symbols.update(word.split())

    merges = []
    while len(symbols) < vocab_size:
        pairs = _get_pairs(vocab)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab = _merge_vocab(best, vocab)
        symbols.add("".join(best))
        merges.append(best)

    word_to_id = {s: i for i, s in enumerate(sorted(symbols))}

    def _encode_word(word: str) -> List[int]:
        # Training spells each word with a trailing "</w>" symbol, so the
        # same marker is needed here for end-of-word merges to apply.
        pieces = list(word) + ["</w>"]
        for a, b in merges:
            if len(pieces) < 2:
                break
            i = 0
            fused = []
            while i < len(pieces):
                if i < len(pieces) - 1 and pieces[i] == a and pieces[i + 1] == b:
                    fused.append(a + b)
                    i += 2
                else:
                    fused.append(pieces[i])
                    i += 1
            pieces = fused
        return [word_to_id.get(p, 0) for p in pieces]

    # Replaying the merge list is the expensive part; do it once per
    # distinct word instead of once per occurrence.
    encoded: dict = {}
    ids: List[int] = []
    for word in _simple_pretokenize(text):
        if word not in encoded:
            encoded[word] = _encode_word(word)
        ids.extend(encoded[word])

    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
        with open(os.path.join(save_dir, "bpe_vocab.json"), "w") as f:
            json.dump(word_to_id, f, indent=2)
    print(f" Pure-Python BPE: vocab={len(symbols)}, tokens={len(ids):,}")
    return ids
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def tokenize_custom(text: str, vocab_size: int = 8000, save_dir: Optional[str] = None) -> List[int]:
    """
    Train a BPE tokenizer on ``text`` and return the token IDs of ``text``.

    Uses the ``tokenizers`` library when it can be imported; the trained
    tokenizer is saved as ``tokenizer.json`` in ``save_dir`` when one is
    given. If an ImportError occurs, a warning is printed and the
    pure-Python BPE implementation is used instead.
    """
    try:
        import tokenizers  # noqa
        from tokenizers import Tokenizer
        from tokenizers.models import BPE
        from tokenizers.trainers import BpeTrainer
        from tokenizers.pre_tokenizers import Whitespace

        print(f" Training HF BPE tokenizer (vocab_size={vocab_size}) ...")
        bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        bpe_tokenizer.pre_tokenizer = Whitespace()
        bpe_trainer = BpeTrainer(
            vocab_size=vocab_size,
            special_tokens=["[UNK]"],
            min_frequency=2,
        )
        bpe_tokenizer.train_from_iterator([text], trainer=bpe_trainer)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
            target = os.path.join(save_dir, "tokenizer.json")
            bpe_tokenizer.save(target)
        token_ids = bpe_tokenizer.encode(text).ids
        print(f" HF BPE: vocab={bpe_tokenizer.get_vocab_size()}, tokens={len(token_ids):,}")
        return token_ids
    except ImportError:
        print(" [WARN] tokenizers library not found, using pure-Python BPE.")
        return _pure_python_bpe(text, vocab_size=vocab_size, save_dir=save_dir)
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: suur_data
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Intelligent data ingestion and tokenization pipeline
|
|
5
|
+
Author: Your Name
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: requests
|
|
11
|
+
Requires-Dist: beautifulsoup4
|
|
12
|
+
Requires-Dist: scikit-learn
|
|
13
|
+
Requires-Dist: numpy
|
|
14
|
+
Requires-Dist: click
|
|
15
|
+
Requires-Dist: chardet
|
|
16
|
+
Provides-Extra: pdf
|
|
17
|
+
Requires-Dist: pdfminer.six; extra == "pdf"
|
|
18
|
+
Provides-Extra: docx
|
|
19
|
+
Requires-Dist: python-docx; extra == "docx"
|
|
20
|
+
Provides-Extra: epub
|
|
21
|
+
Requires-Dist: ebooklib; extra == "epub"
|
|
22
|
+
Provides-Extra: hf
|
|
23
|
+
Requires-Dist: transformers; extra == "hf"
|
|
24
|
+
Requires-Dist: tokenizers; extra == "hf"
|
|
25
|
+
Provides-Extra: all
|
|
26
|
+
Requires-Dist: pdfminer.six; extra == "all"
|
|
27
|
+
Requires-Dist: python-docx; extra == "all"
|
|
28
|
+
Requires-Dist: ebooklib; extra == "all"
|
|
29
|
+
Requires-Dist: transformers; extra == "all"
|
|
30
|
+
Requires-Dist: tokenizers; extra == "all"
|
|
31
|
+
Dynamic: author
|
|
32
|
+
Dynamic: classifier
|
|
33
|
+
Dynamic: description
|
|
34
|
+
Dynamic: description-content-type
|
|
35
|
+
Dynamic: provides-extra
|
|
36
|
+
Dynamic: requires-dist
|
|
37
|
+
Dynamic: requires-python
|
|
38
|
+
Dynamic: summary
|
|
39
|
+
|
|
40
|
+
# Suur Data
|
|
41
|
+
|
|
42
|
+
**Intelligent data ingestion and tokenization pipeline.**
|
|
43
|
+
|
|
44
|
+
Suur Data fetches text from a URL or local file, filters it by topic using a TF-IDF cosine-similarity relevance scorer, then tokenizes it using either a pretrained HuggingFace tokenizer or a custom-trained BPE tokenizer.
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# Core (URLs, .txt, .csv, .json, .html)
|
|
52
|
+
pip install -e .
|
|
53
|
+
|
|
54
|
+
# With all optional formats + HuggingFace tokenizers
|
|
55
|
+
pip install -e ".[all]"
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Python API
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from suur_data import suur_data
|
|
64
|
+
|
|
65
|
+
# Minimal — fetches URL, no filter, GPT-2 tokenizer
|
|
66
|
+
tokens = suur_data("https://en.wikipedia.org/wiki/Neuroscience")
|
|
67
|
+
|
|
68
|
+
# Filter by topic, custom BPE tokenizer
|
|
69
|
+
tokens = suur_data(
|
|
70
|
+
"research_paper.pdf",
|
|
71
|
+
topic="quantum computing",
|
|
72
|
+
tokenizer="custom",
|
|
73
|
+
vocab_size=4000,
|
|
74
|
+
save_dir="./my_tokenizer",
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
# Local file, pretrained BERT tokenizer, strict filter
|
|
78
|
+
tokens = suur_data(
|
|
79
|
+
"~/corpus/biology.txt",
|
|
80
|
+
topic="cell biology",
|
|
81
|
+
tokenizer="pretrained",
|
|
82
|
+
model="bert",
|
|
83
|
+
threshold=0.10,
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
# Skip the filter entirely
|
|
87
|
+
tokens = suur_data("data.csv", no_filter=True)
|
|
88
|
+
|
|
89
|
+
print(tokens[:20]) # list of integer token IDs
|
|
90
|
+
print(len(tokens)) # total token count
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Parameters
|
|
94
|
+
|
|
95
|
+
| Parameter | Type | Default | Description |
|
|
96
|
+
|-----------|------|---------|-------------|
|
|
97
|
+
| `data_location` | str | — | URL or local file path |
|
|
98
|
+
| `topic` | str | `""` | Subject for relevance filtering (empty = skip filter) |
|
|
99
|
+
| `tokenizer` | str | `"pretrained"` | `"pretrained"` or `"custom"` |
|
|
100
|
+
| `model` | str | `"gpt2"` | HuggingFace model shortcut or full ID |
|
|
101
|
+
| `vocab_size` | int | `8000` | BPE vocab size for custom tokenizer |
|
|
102
|
+
| `threshold` | float | `0.05` | Cosine similarity cutoff (0.0–1.0) |
|
|
103
|
+
| `save_dir` | str | `None` | Path to save tokenizer files |
|
|
104
|
+
| `no_filter` | bool | `False` | Skip the relevance filter |
|
|
105
|
+
| `verbose` | bool | `True` | Show progress output |
|
|
106
|
+
|
|
107
|
+
### Returns
|
|
108
|
+
`List[int]` — flat list of integer token IDs.
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
## CLI
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
# Basic URL fetch
|
|
116
|
+
suur_data fetch https://example.com/article --topic "machine learning"
|
|
117
|
+
|
|
118
|
+
# PDF with custom BPE tokenizer
|
|
119
|
+
suur_data fetch paper.pdf --topic "protein folding" --tokenizer custom --vocab-size 6000
|
|
120
|
+
|
|
121
|
+
# Local file, pretrained BERT, save tokenizer
|
|
122
|
+
suur_data fetch corpus.txt --tokenizer pretrained --model bert --save-dir ./bert_tok
|
|
123
|
+
|
|
124
|
+
# Skip filter, save tokens to file
|
|
125
|
+
suur_data fetch data.json --no-filter --output tokens.json
|
|
126
|
+
|
|
127
|
+
# See supported models
|
|
128
|
+
suur_data models
|
|
129
|
+
|
|
130
|
+
# See supported file formats
|
|
131
|
+
suur_data formats
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## Supported Input Formats
|
|
137
|
+
|
|
138
|
+
| Format | Notes |
|
|
139
|
+
|--------|-------|
|
|
140
|
+
| `.txt`, `.md`, `.rst` | Plain text |
|
|
141
|
+
| `.pdf` | Requires `pdfminer.six` |
|
|
142
|
+
| `.docx` | Requires `python-docx` |
|
|
143
|
+
| `.csv`, `.tsv` | All cells joined as text |
|
|
144
|
+
| `.json` | Recursively flattened key-value pairs |
|
|
145
|
+
| `.html`, `.htm` | Scripts/styles stripped (requires `beautifulsoup4`) |
|
|
146
|
+
| `.epub` | E-books (requires `ebooklib` + `beautifulsoup4`) |
|
|
147
|
+
| HTTP/HTTPS URL | Auto-downloaded, then parsed by extension |
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## Pretrained Model Shortcuts
|
|
152
|
+
|
|
153
|
+
| Shortcut | Model |
|
|
154
|
+
|----------|-------|
|
|
155
|
+
| `gpt2` | GPT-2 (OpenAI) |
|
|
156
|
+
| `bert` | BERT base uncased |
|
|
157
|
+
| `roberta` | RoBERTa base |
|
|
158
|
+
| `distilbert` | DistilBERT base uncased |
|
|
159
|
+
| `t5` | T5 small |
|
|
160
|
+
|
|
161
|
+
You can also pass any HuggingFace Hub model ID directly:
|
|
162
|
+
```
|
|
163
|
+
--model "facebook/opt-125m"
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Architecture
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
Source (URL / file)
|
|
172
|
+
│
|
|
173
|
+
▼
|
|
174
|
+
Stage 1: Ingest
|
|
175
|
+
Handles 8 file types + HTTP download
|
|
176
|
+
│
|
|
177
|
+
▼
|
|
178
|
+
Stage 2: Neural Filter
|
|
179
|
+
Splits into paragraph chunks
|
|
180
|
+
Scores each chunk against topic via TF-IDF cosine similarity
|
|
181
|
+
Drops chunks below threshold
|
|
182
|
+
│
|
|
183
|
+
▼
|
|
184
|
+
Stage 3: Tokenize
|
|
185
|
+
┌─────────────────────┐ ┌────────────────────────────┐
|
|
186
|
+
│ Pretrained mode │ │ Custom mode │
|
|
187
|
+
│ HuggingFace │ │ BPE trainer (HF library │
|
|
188
|
+
│ AutoTokenizer │ │ or pure-Python fallback) │
|
|
189
|
+
└─────────────────────┘ └────────────────────────────┘
|
|
190
|
+
│
|
|
191
|
+
▼
|
|
192
|
+
List[int] ← token IDs
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## Dependency Matrix
|
|
198
|
+
|
|
199
|
+
| Feature | Required packages |
|
|
200
|
+
|---------|------------------|
|
|
201
|
+
| Core pipeline | `requests`, `beautifulsoup4`, `scikit-learn`, `numpy`, `click`, `chardet` |
|
|
202
|
+
| PDF support | `pdfminer.six` |
|
|
203
|
+
| .docx support | `python-docx` |
|
|
204
|
+
| .epub support | `ebooklib` |
|
|
205
|
+
| Pretrained tokenizers | `transformers` |
|
|
206
|
+
| Fast BPE training | `tokenizers` |
|
|
207
|
+
|
|
208
|
+
Only the core pipeline packages are required; everything else is optional — the tool degrades gracefully with built-in fallbacks when optional packages are missing.
|
|
209
|
+
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
suur_data/__init__.py,sha256=NN4R90oZcJIzHtwn8L3sksZ0bFAVQTbvQF-6--hwIvU,8064
|
|
2
|
+
suur_data/filter.py,sha256=HCp_j35kC2UeKV_QthbW5SV5NUwS2F9UKF72lxIWljM,5804
|
|
3
|
+
suur_data/ingest.py,sha256=c2d5d4wVawxdd2FG_Ecsxb5bRLvGz-PiFAHcIoPvxZo,4811
|
|
4
|
+
suur_data/tokenizer.py,sha256=uDnGTKBigXdMQkPcJm-ebC62Sim7fHQnsTOavy6Ir5k,4989
|
|
5
|
+
suur_data-1.0.0.dist-info/METADATA,sha256=rmBT0QEdEvx6A_EdMrQYM7e8LIl-Tz6VL83tFFFC9A0,5856
|
|
6
|
+
suur_data-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
7
|
+
suur_data-1.0.0.dist-info/entry_points.txt,sha256=TKKBMSX0wa1BroERW5ftHglvR2-Fy_8fxij3XmqnO7o,45
|
|
8
|
+
suur_data-1.0.0.dist-info/top_level.txt,sha256=8ooHxzPfqZ56sFPb_U6tRTKdxXBlsBrZOhDHWO9g_x4,10
|
|
9
|
+
suur_data-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
suur_data
|