suur-data 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- suur_data-1.0.0/PKG-INFO +209 -0
- suur_data-1.0.0/README.md +170 -0
- suur_data-1.0.0/setup.cfg +4 -0
- suur_data-1.0.0/setup.py +43 -0
- suur_data-1.0.0/suur_data/__init__.py +233 -0
- suur_data-1.0.0/suur_data/filter.py +183 -0
- suur_data-1.0.0/suur_data/ingest.py +165 -0
- suur_data-1.0.0/suur_data/tokenizer.py +142 -0
- suur_data-1.0.0/suur_data.egg-info/PKG-INFO +209 -0
- suur_data-1.0.0/suur_data.egg-info/SOURCES.txt +12 -0
- suur_data-1.0.0/suur_data.egg-info/dependency_links.txt +1 -0
- suur_data-1.0.0/suur_data.egg-info/entry_points.txt +2 -0
- suur_data-1.0.0/suur_data.egg-info/requires.txt +26 -0
- suur_data-1.0.0/suur_data.egg-info/top_level.txt +1 -0
suur_data-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: suur_data
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Intelligent data ingestion and tokenization pipeline
|
|
5
|
+
Author: Your Name
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: requests
|
|
11
|
+
Requires-Dist: beautifulsoup4
|
|
12
|
+
Requires-Dist: scikit-learn
|
|
13
|
+
Requires-Dist: numpy
|
|
14
|
+
Requires-Dist: click
|
|
15
|
+
Requires-Dist: chardet
|
|
16
|
+
Provides-Extra: pdf
|
|
17
|
+
Requires-Dist: pdfminer.six; extra == "pdf"
|
|
18
|
+
Provides-Extra: docx
|
|
19
|
+
Requires-Dist: python-docx; extra == "docx"
|
|
20
|
+
Provides-Extra: epub
|
|
21
|
+
Requires-Dist: ebooklib; extra == "epub"
|
|
22
|
+
Provides-Extra: hf
|
|
23
|
+
Requires-Dist: transformers; extra == "hf"
|
|
24
|
+
Requires-Dist: tokenizers; extra == "hf"
|
|
25
|
+
Provides-Extra: all
|
|
26
|
+
Requires-Dist: pdfminer.six; extra == "all"
|
|
27
|
+
Requires-Dist: python-docx; extra == "all"
|
|
28
|
+
Requires-Dist: ebooklib; extra == "all"
|
|
29
|
+
Requires-Dist: transformers; extra == "all"
|
|
30
|
+
Requires-Dist: tokenizers; extra == "all"
|
|
31
|
+
Dynamic: author
|
|
32
|
+
Dynamic: classifier
|
|
33
|
+
Dynamic: description
|
|
34
|
+
Dynamic: description-content-type
|
|
35
|
+
Dynamic: provides-extra
|
|
36
|
+
Dynamic: requires-dist
|
|
37
|
+
Dynamic: requires-python
|
|
38
|
+
Dynamic: summary
|
|
39
|
+
|
|
40
|
+
# Suur Data
|
|
41
|
+
|
|
42
|
+
**Intelligent data ingestion and tokenization pipeline.**
|
|
43
|
+
|
|
44
|
+
Suur Data fetches text from any source, filters it by topic using a TF-IDF cosine-similarity relevance scorer, then tokenizes it using either a pretrained HuggingFace tokenizer or a custom-trained BPE tokenizer.
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# Core (URLs, .txt, .csv, .json, .html)
|
|
52
|
+
pip install -e .
|
|
53
|
+
|
|
54
|
+
# With all optional formats + HuggingFace tokenizers
|
|
55
|
+
pip install -e ".[all]"
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Python API
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from suur_data import suur_data
|
|
64
|
+
|
|
65
|
+
# Minimal — fetches URL, no filter, GPT-2 tokenizer
|
|
66
|
+
tokens = suur_data("https://en.wikipedia.org/wiki/Neuroscience")
|
|
67
|
+
|
|
68
|
+
# Filter by topic, custom BPE tokenizer
|
|
69
|
+
tokens = suur_data(
|
|
70
|
+
"research_paper.pdf",
|
|
71
|
+
topic="quantum computing",
|
|
72
|
+
tokenizer="custom",
|
|
73
|
+
vocab_size=4000,
|
|
74
|
+
save_dir="./my_tokenizer",
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
# Local file, pretrained BERT tokenizer, strict filter
|
|
78
|
+
tokens = suur_data(
|
|
79
|
+
"~/corpus/biology.txt",
|
|
80
|
+
topic="cell biology",
|
|
81
|
+
tokenizer="pretrained",
|
|
82
|
+
model="bert",
|
|
83
|
+
threshold=0.10,
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
# Skip the filter entirely
|
|
87
|
+
tokens = suur_data("data.csv", no_filter=True)
|
|
88
|
+
|
|
89
|
+
print(tokens[:20]) # list of integer token IDs
|
|
90
|
+
print(len(tokens)) # total token count
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Parameters
|
|
94
|
+
|
|
95
|
+
| Parameter | Type | Default | Description |
|
|
96
|
+
|-----------|------|---------|-------------|
|
|
97
|
+
| `data_location` | str | — | URL or local file path |
|
|
98
|
+
| `topic` | str | `""` | Subject for relevance filtering (empty = skip filter) |
|
|
99
|
+
| `tokenizer` | str | `"pretrained"` | `"pretrained"` or `"custom"` |
|
|
100
|
+
| `model` | str | `"gpt2"` | HuggingFace model shortcut or full ID |
|
|
101
|
+
| `vocab_size` | int | `8000` | BPE vocab size for custom tokenizer |
|
|
102
|
+
| `threshold` | float | `0.05` | Cosine similarity cutoff (0.0–1.0) |
|
|
103
|
+
| `save_dir` | str | `None` | Path to save tokenizer files |
|
|
104
|
+
| `no_filter` | bool | `False` | Skip the relevance filter |
|
|
105
|
+
| `verbose` | bool | `True` | Show progress output |
|
|
106
|
+
|
|
107
|
+
### Returns
|
|
108
|
+
`List[int]` — flat list of integer token IDs.
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
## CLI
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
# Basic URL fetch
|
|
116
|
+
suur-data fetch https://example.com/article --topic "machine learning"
|
|
117
|
+
|
|
118
|
+
# PDF with custom BPE tokenizer
|
|
119
|
+
suur-data fetch paper.pdf --topic "protein folding" --tokenizer custom --vocab-size 6000
|
|
120
|
+
|
|
121
|
+
# Local file, pretrained BERT, save tokenizer
|
|
122
|
+
suur-data fetch corpus.txt --tokenizer pretrained --model bert --save-dir ./bert_tok
|
|
123
|
+
|
|
124
|
+
# Skip filter, save tokens to file
|
|
125
|
+
suur-data fetch data.json --no-filter --output tokens.json
|
|
126
|
+
|
|
127
|
+
# See supported models
|
|
128
|
+
suur-data models
|
|
129
|
+
|
|
130
|
+
# See supported file formats
|
|
131
|
+
suur-data formats
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## Supported Input Formats
|
|
137
|
+
|
|
138
|
+
| Format | Notes |
|
|
139
|
+
|--------|-------|
|
|
140
|
+
| `.txt`, `.md`, `.rst` | Plain text |
|
|
141
|
+
| `.pdf` | Requires `pdfminer.six` |
|
|
142
|
+
| `.docx` | Requires `python-docx` |
|
|
143
|
+
| `.csv`, `.tsv` | All cells joined as text |
|
|
144
|
+
| `.json` | Recursively flattened key-value pairs |
|
|
145
|
+
| `.html`, `.htm` | Scripts/styles stripped (requires `beautifulsoup4`) |
|
|
146
|
+
| `.epub` | E-books (requires `ebooklib` + `beautifulsoup4`) |
|
|
147
|
+
| HTTP/HTTPS URL | Auto-downloaded, then parsed by extension |
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## Pretrained Model Shortcuts
|
|
152
|
+
|
|
153
|
+
| Shortcut | Model |
|
|
154
|
+
|----------|-------|
|
|
155
|
+
| `gpt2` | GPT-2 (OpenAI) |
|
|
156
|
+
| `bert` | BERT base uncased |
|
|
157
|
+
| `roberta` | RoBERTa base |
|
|
158
|
+
| `distilbert` | DistilBERT base uncased |
|
|
159
|
+
| `t5` | T5 small |
|
|
160
|
+
|
|
161
|
+
You can also pass any HuggingFace Hub model ID directly:
|
|
162
|
+
```
|
|
163
|
+
--model "facebook/opt-125m"
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Architecture
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
Source (URL / file)
|
|
172
|
+
│
|
|
173
|
+
▼
|
|
174
|
+
Stage 1: Ingest
|
|
175
|
+
Handles 8 file types + HTTP download
|
|
176
|
+
│
|
|
177
|
+
▼
|
|
178
|
+
Stage 2: Neural Filter
|
|
179
|
+
Splits into paragraph chunks
|
|
180
|
+
Scores each chunk against topic via TF-IDF cosine similarity
|
|
181
|
+
Drops chunks below threshold
|
|
182
|
+
│
|
|
183
|
+
▼
|
|
184
|
+
Stage 3: Tokenize
|
|
185
|
+
┌─────────────────────┐ ┌────────────────────────────┐
|
|
186
|
+
│ Pretrained mode │ │ Custom mode │
|
|
187
|
+
│ HuggingFace │ │ BPE trainer (HF library │
|
|
188
|
+
│ AutoTokenizer │ │ or pure-Python fallback) │
|
|
189
|
+
└─────────────────────┘ └────────────────────────────┘
|
|
190
|
+
│
|
|
191
|
+
▼
|
|
192
|
+
List[int] ← token IDs
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## Dependency Matrix
|
|
198
|
+
|
|
199
|
+
| Feature | Required packages |
|
|
200
|
+
|---------|------------------|
|
|
201
|
+
| Core pipeline | `requests`, `beautifulsoup4`, `scikit-learn`, `numpy`, `click`, `chardet` |
|
|
202
|
+
| PDF support | `pdfminer.six` |
|
|
203
|
+
| .docx support | `python-docx` |
|
|
204
|
+
| .epub support | `ebooklib` |
|
|
205
|
+
| Pretrained tokenizers | `transformers` |
|
|
206
|
+
| Fast BPE training | `tokenizers` |
|
|
207
|
+
|
|
208
|
+
All optional — the tool degrades gracefully with built-in fallbacks when optional packages are missing.
|
|
209
|
+
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# Suur Data
|
|
2
|
+
|
|
3
|
+
**Intelligent data ingestion and tokenization pipeline.**
|
|
4
|
+
|
|
5
|
+
Suur Data fetches text from any source, filters it by topic using a TF-IDF cosine-similarity relevance scorer, then tokenizes it using either a pretrained HuggingFace tokenizer or a custom-trained BPE tokenizer.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
# Core (URLs, .txt, .csv, .json, .html)
|
|
13
|
+
pip install -e .
|
|
14
|
+
|
|
15
|
+
# With all optional formats + HuggingFace tokenizers
|
|
16
|
+
pip install -e ".[all]"
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Python API
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
from suur_data import suur_data
|
|
25
|
+
|
|
26
|
+
# Minimal — fetches URL, no filter, GPT-2 tokenizer
|
|
27
|
+
tokens = suur_data("https://en.wikipedia.org/wiki/Neuroscience")
|
|
28
|
+
|
|
29
|
+
# Filter by topic, custom BPE tokenizer
|
|
30
|
+
tokens = suur_data(
|
|
31
|
+
"research_paper.pdf",
|
|
32
|
+
topic="quantum computing",
|
|
33
|
+
tokenizer="custom",
|
|
34
|
+
vocab_size=4000,
|
|
35
|
+
save_dir="./my_tokenizer",
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
# Local file, pretrained BERT tokenizer, strict filter
|
|
39
|
+
tokens = suur_data(
|
|
40
|
+
"~/corpus/biology.txt",
|
|
41
|
+
topic="cell biology",
|
|
42
|
+
tokenizer="pretrained",
|
|
43
|
+
model="bert",
|
|
44
|
+
threshold=0.10,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
# Skip the filter entirely
|
|
48
|
+
tokens = suur_data("data.csv", no_filter=True)
|
|
49
|
+
|
|
50
|
+
print(tokens[:20]) # list of integer token IDs
|
|
51
|
+
print(len(tokens)) # total token count
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Parameters
|
|
55
|
+
|
|
56
|
+
| Parameter | Type | Default | Description |
|
|
57
|
+
|-----------|------|---------|-------------|
|
|
58
|
+
| `data_location` | str | — | URL or local file path |
|
|
59
|
+
| `topic` | str | `""` | Subject for relevance filtering (empty = skip filter) |
|
|
60
|
+
| `tokenizer` | str | `"pretrained"` | `"pretrained"` or `"custom"` |
|
|
61
|
+
| `model` | str | `"gpt2"` | HuggingFace model shortcut or full ID |
|
|
62
|
+
| `vocab_size` | int | `8000` | BPE vocab size for custom tokenizer |
|
|
63
|
+
| `threshold` | float | `0.05` | Cosine similarity cutoff (0.0–1.0) |
|
|
64
|
+
| `save_dir` | str | `None` | Path to save tokenizer files |
|
|
65
|
+
| `no_filter` | bool | `False` | Skip the relevance filter |
|
|
66
|
+
| `verbose` | bool | `True` | Show progress output |
|
|
67
|
+
|
|
68
|
+
### Returns
|
|
69
|
+
`List[int]` — flat list of integer token IDs.
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## CLI
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# Basic URL fetch
|
|
77
|
+
suur-data fetch https://example.com/article --topic "machine learning"
|
|
78
|
+
|
|
79
|
+
# PDF with custom BPE tokenizer
|
|
80
|
+
suur-data fetch paper.pdf --topic "protein folding" --tokenizer custom --vocab-size 6000
|
|
81
|
+
|
|
82
|
+
# Local file, pretrained BERT, save tokenizer
|
|
83
|
+
suur-data fetch corpus.txt --tokenizer pretrained --model bert --save-dir ./bert_tok
|
|
84
|
+
|
|
85
|
+
# Skip filter, save tokens to file
|
|
86
|
+
suur-data fetch data.json --no-filter --output tokens.json
|
|
87
|
+
|
|
88
|
+
# See supported models
|
|
89
|
+
suur-data models
|
|
90
|
+
|
|
91
|
+
# See supported file formats
|
|
92
|
+
suur-data formats
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## Supported Input Formats
|
|
98
|
+
|
|
99
|
+
| Format | Notes |
|
|
100
|
+
|--------|-------|
|
|
101
|
+
| `.txt`, `.md`, `.rst` | Plain text |
|
|
102
|
+
| `.pdf` | Requires `pdfminer.six` |
|
|
103
|
+
| `.docx` | Requires `python-docx` |
|
|
104
|
+
| `.csv`, `.tsv` | All cells joined as text |
|
|
105
|
+
| `.json` | Recursively flattened key-value pairs |
|
|
106
|
+
| `.html`, `.htm` | Scripts/styles stripped (requires `beautifulsoup4`) |
|
|
107
|
+
| `.epub` | E-books (requires `ebooklib` + `beautifulsoup4`) |
|
|
108
|
+
| HTTP/HTTPS URL | Auto-downloaded, then parsed by extension |
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
## Pretrained Model Shortcuts
|
|
113
|
+
|
|
114
|
+
| Shortcut | Model |
|
|
115
|
+
|----------|-------|
|
|
116
|
+
| `gpt2` | GPT-2 (OpenAI) |
|
|
117
|
+
| `bert` | BERT base uncased |
|
|
118
|
+
| `roberta` | RoBERTa base |
|
|
119
|
+
| `distilbert` | DistilBERT base uncased |
|
|
120
|
+
| `t5` | T5 small |
|
|
121
|
+
|
|
122
|
+
You can also pass any HuggingFace Hub model ID directly:
|
|
123
|
+
```
|
|
124
|
+
--model "facebook/opt-125m"
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## Architecture
|
|
130
|
+
|
|
131
|
+
```
|
|
132
|
+
Source (URL / file)
|
|
133
|
+
│
|
|
134
|
+
▼
|
|
135
|
+
Stage 1: Ingest
|
|
136
|
+
Handles 8 file types + HTTP download
|
|
137
|
+
│
|
|
138
|
+
▼
|
|
139
|
+
Stage 2: Neural Filter
|
|
140
|
+
Splits into paragraph chunks
|
|
141
|
+
Scores each chunk against topic via TF-IDF cosine similarity
|
|
142
|
+
Drops chunks below threshold
|
|
143
|
+
│
|
|
144
|
+
▼
|
|
145
|
+
Stage 3: Tokenize
|
|
146
|
+
┌─────────────────────┐ ┌────────────────────────────┐
|
|
147
|
+
│ Pretrained mode │ │ Custom mode │
|
|
148
|
+
│ HuggingFace │ │ BPE trainer (HF library │
|
|
149
|
+
│ AutoTokenizer │ │ or pure-Python fallback) │
|
|
150
|
+
└─────────────────────┘ └────────────────────────────┘
|
|
151
|
+
│
|
|
152
|
+
▼
|
|
153
|
+
List[int] ← token IDs
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Dependency Matrix
|
|
159
|
+
|
|
160
|
+
| Feature | Required packages |
|
|
161
|
+
|---------|------------------|
|
|
162
|
+
| Core pipeline | `requests`, `beautifulsoup4`, `scikit-learn`, `numpy`, `click`, `chardet` |
|
|
163
|
+
| PDF support | `pdfminer.six` |
|
|
164
|
+
| .docx support | `python-docx` |
|
|
165
|
+
| .epub support | `ebooklib` |
|
|
166
|
+
| Pretrained tokenizers | `transformers` |
|
|
167
|
+
| Fast BPE training | `tokenizers` |
|
|
168
|
+
|
|
169
|
+
All optional — the tool degrades gracefully with built-in fallbacks when optional packages are missing.
|
|
170
|
+
|
suur_data-1.0.0/setup.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Packaging script for the ``suur_data`` distribution."""

from pathlib import Path

from setuptools import setup, find_packages

# Locate the README next to this script instead of relying on the current
# working directory, and decode it explicitly as UTF-8: the README contains
# non-ASCII characters (em dashes, box-drawing glyphs) that a locale-default
# decode can mangle or reject. read_text() also closes the file handle.
_README = Path(__file__).resolve().parent / "README.md"

setup(
    name="suur_data",
    version="1.0.0",
    description="Intelligent data ingestion and tokenization pipeline",
    long_description=_README.read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    author="Your Name",
    python_requires=">=3.10",
    packages=find_packages(),
    install_requires=[
        "requests",
        "beautifulsoup4",
        "scikit-learn",
        "numpy",
        "click",
        "chardet",
    ],
    extras_require={
        "pdf": ["pdfminer.six"],
        "docx": ["python-docx"],
        "epub": ["ebooklib"],
        "hf": ["transformers", "tokenizers"],  # pretrained + HF BPE
        "all": [
            "pdfminer.six",
            "python-docx",
            "ebooklib",
            "transformers",
            "tokenizers",
        ],
    },
    entry_points={
        "console_scripts": [
            # The installed command is spelled with a hyphen: `suur-data`.
            "suur-data=suur_data:main",
        ],
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "Topic :: Text Processing :: Linguistic",
    ],
)
|
|
43
|
+
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
"""
|
|
2
|
+
suur_data
|
|
3
|
+
========
|
|
4
|
+
A smart data ingestion and tokenization pipeline.
|
|
5
|
+
|
|
6
|
+
Python API usage:
|
|
7
|
+
from suur_data import suur_data
|
|
8
|
+
|
|
9
|
+
tokens = suur_data("https://example.com/article.html")
|
|
10
|
+
tokens = suur_data("my_corpus.pdf", topic="quantum computing")
|
|
11
|
+
tokens = suur_data("data.txt", tokenizer="custom", vocab_size=4000)
|
|
12
|
+
|
|
13
|
+
CLI usage:
|
|
14
|
+
suur-data fetch <source> [OPTIONS]
|
|
15
|
+
|
|
16
|
+
Options:
|
|
17
|
+
--topic TEXT Topic/subject to filter content by
|
|
18
|
+
--tokenizer [pretrained|custom]
|
|
19
|
+
--model TEXT Pretrained model name (default: gpt2)
|
|
20
|
+
--vocab-size INT BPE vocab size for custom mode (default: 8000)
|
|
21
|
+
--threshold FLOAT Relevance threshold 0.0–1.0 (default: 0.05)
|
|
22
|
+
--save-dir PATH Where to save tokenizer artifacts
|
|
23
|
+
--no-filter Skip the relevance filter
|
|
24
|
+
--quiet Suppress progress output
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import os
|
|
30
|
+
import sys
|
|
31
|
+
import json
|
|
32
|
+
import tempfile
|
|
33
|
+
from typing import List, Optional
|
|
34
|
+
|
|
35
|
+
import click
|
|
36
|
+
|
|
37
|
+
from .ingest import ingest
|
|
38
|
+
from .filter import filter_chunks
|
|
39
|
+
from .tokenizer import tokenize_pretrained, tokenize_custom
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
# Python API
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
|
|
46
|
+
def suur_data(
    data_location: str,
    topic: str = "",
    tokenizer: str = "pretrained",
    model: str = "gpt2",
    vocab_size: int = 8000,
    threshold: float = 0.05,
    save_dir: Optional[str] = None,
    no_filter: bool = False,
    verbose: bool = True,
) -> List[int]:
    """
    End-to-end pipeline: ingest → filter → tokenize.

    Parameters
    ----------
    data_location : str
        URL or local file path, passed straight to ``ingest``.
    topic : str
        Subject/keyword for the relevance filter. An empty or
        whitespace-only topic skips filtering.
    tokenizer : str
        "custom" selects ``tokenize_custom``; any other value
        (default "pretrained") selects ``tokenize_pretrained``.
    model : str
        Model name forwarded to ``tokenize_pretrained`` (default: "gpt2").
    vocab_size : int
        Vocabulary size forwarded to ``tokenize_custom`` (default: 8000).
    threshold : float
        Cutoff forwarded to ``filter_chunks`` (default: 0.05).
    save_dir : str | None
        Forwarded to the tokenizer as the place to save its files.
        None = don't save.
    no_filter : bool
        If True, skip the relevance filter and tokenize everything.
    verbose : bool
        Print progress information to stdout.

    Returns
    -------
    List[int]
        The token IDs returned by the selected tokenizer.
    """
    # Scratch directory handed to ingest(). The context manager removes it
    # when the pipeline finishes or raises, so repeated calls no longer leave
    # "suur_data_*" directories behind. Cleanup errors are ignored so a
    # successful run is not turned into a failure by a stuck temp file.
    with tempfile.TemporaryDirectory(
        prefix="suur_data_", ignore_cleanup_errors=True
    ) as tmp:
        # --- Stage 1: Ingest ---
        if verbose:
            print(f"\n[suur_data] Stage 1 — Ingesting: {data_location}")
        raw_text = ingest(data_location, tmp_dir=tmp)

        # --- Stage 2: Filter ---
        if no_filter or not topic.strip():
            if verbose:
                if no_filter:
                    print("[suur_data] Stage 2 — Filter: SKIPPED (--no-filter)")
                else:
                    print("[suur_data] Stage 2 — Filter: SKIPPED (no topic given)")
            filtered_text = raw_text
        else:
            if verbose:
                print(f"[suur_data] Stage 2 — Neural Filter (topic: '{topic}')")
            # filter_chunks returns a pair; only the filtered text is used here.
            filtered_text, _ = filter_chunks(
                raw_text,
                topic=topic,
                threshold=threshold,
                verbose=verbose,
            )

        # --- Stage 3: Tokenize ---
        if verbose:
            print(f"[suur_data] Stage 3 — Tokenizing ({tokenizer} mode)")

        if tokenizer == "custom":
            tokens = tokenize_custom(filtered_text, vocab_size=vocab_size, save_dir=save_dir)
        else:
            tokens = tokenize_pretrained(filtered_text, model_name=model, save_dir=save_dir)

    if verbose:
        print(f"\n[suur_data] Done. Total tokens: {len(tokens):,}\n")

    return tokens
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# ---------------------------------------------------------------------------
|
|
129
|
+
# CLI
|
|
130
|
+
# ---------------------------------------------------------------------------
|
|
131
|
+
|
|
132
|
+
# Root click command group. It has no body of its own: the docstring below is
# the text click shows for --help, and subcommands register themselves on this
# group with @cli.command(...).
@click.group()
@click.version_option("1.0.0", prog_name="suur-data")
def cli() -> None:
    """
    \b
    Suur Data — intelligent data ingestion and tokenization pipeline.

    Fetch any text source, filter it by topic using TF-IDF relevance
    scoring, then tokenize with a pretrained or custom BPE tokenizer.
    """
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# `fetch` subcommand: a thin CLI wrapper around the suur_data() Python API.
# Option types are spelled out explicitly instead of being inferred from the
# defaults, so the accepted types are visible at a glance.
@cli.command("fetch")
@click.argument("source")
@click.option("--topic", default="", show_default=True, help="Topic for relevance filtering.")
@click.option("--tokenizer", default="pretrained", show_default=True, type=click.Choice(["pretrained", "custom"]), help="Tokenizer mode.")
@click.option("--model", default="gpt2", show_default=True, help="Pretrained model name (HuggingFace).")
@click.option("--vocab-size", default=8000, type=int, show_default=True, help="BPE vocab size for custom tokenizer.")
@click.option("--threshold", default=0.05, type=float, show_default=True, help="Relevance filter threshold (0.0–1.0).")
@click.option("--save-dir", default=None, help="Directory to save tokenizer artifacts.")
@click.option("--no-filter", is_flag=True, default=False, help="Skip the relevance filter.")
@click.option("--output", default=None, help="Save token IDs to a JSON file.")
@click.option("--quiet", is_flag=True, default=False, help="Suppress progress output.")
def fetch_cmd(source, topic, tokenizer, model, vocab_size, threshold,
              save_dir, no_filter, output, quiet):
    """
    Ingest SOURCE (URL or file path), filter by topic, and tokenize.

    \b
    Examples:
      suur-data fetch https://en.wikipedia.org/wiki/Neuroscience --topic "brain"
      suur-data fetch corpus.pdf --topic "machine learning" --tokenizer custom
      suur-data fetch data.txt --no-filter --tokenizer pretrained --model bert
    """
    # NOTE: the docstring above is the --help text. Its examples use the
    # hyphenated `suur-data`, matching the prog_name given to
    # click.version_option on the `cli` group.
    tokens = suur_data(
        data_location=source,
        topic=topic,
        tokenizer=tokenizer,
        model=model,
        vocab_size=vocab_size,
        threshold=threshold,
        save_dir=save_dir,
        no_filter=no_filter,
        verbose=not quiet,  # the CLI exposes only --quiet; verbose is its inverse
    )

    if output:
        # Explicit encoding keeps the written file independent of the locale.
        with open(output, "w", encoding="utf-8") as f:
            json.dump(tokens, f)
        click.echo(f"Tokens saved to {output}")
    else:
        # Without --output, print a summary and a short preview only.
        click.echo(f"Token count: {len(tokens):,}")
        click.echo(f"First 50 tokens: {tokens[:50]}")
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
@cli.command("models")
def models_cmd():
    """List supported pretrained model shortcuts."""
    # Shortcut -> human-readable model name; dict order is the display order.
    shortcuts = {
        "gpt2": "GPT-2 (OpenAI)",
        "bert": "BERT base uncased",
        "roberta": "RoBERTa base",
        "distilbert": "DistilBERT base uncased",
        "t5": "T5 small",
    }

    # Assemble every output line first, then emit them one echo per line.
    report = ["\nSupported pretrained model shortcuts:\n"]
    report.extend(f" {alias:<14} {label}" for alias, label in shortcuts.items())
    report.append("\nYou can also pass any HuggingFace model ID directly.")
    report.append('Example: --model "facebook/opt-125m"\n')

    for line in report:
        click.echo(line)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
@cli.command("formats")
def formats_cmd():
    """List supported input file formats."""
    # Extension label -> description; dict order is the display order.
    supported = {
        ".txt / .md / .rst": "Plain text, Markdown, reStructuredText",
        ".pdf": "PDF documents (requires pdfminer.six)",
        ".docx": "Word documents (requires python-docx)",
        ".csv / .tsv": "Comma/tab-separated values",
        ".json": "JSON — recursively flattens key-value pairs",
        ".html / .htm": "HTML pages (requires beautifulsoup4)",
        ".epub": "E-books (requires ebooklib + beautifulsoup4)",
        "URL": "Any HTTP/HTTPS URL — auto-downloaded",
    }

    click.echo("\nSupported input formats:\n")
    for label, description in supported.items():
        row = f" {label:<22} {description}"
        click.echo(row)
    # Trailing blank line after the table.
    click.echo()
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
# ---------------------------------------------------------------------------
|
|
224
|
+
# Entry point for the `suur-data` console script. NOTE: `python -m suur_data` would additionally need a `__main__.py` in this package.
|
|
225
|
+
# ---------------------------------------------------------------------------
|
|
226
|
+
|
|
227
|
+
def main() -> None:
    """Invoke the ``cli`` click group (parses ``sys.argv`` and dispatches)."""
    cli()


# Also run the CLI when this file is executed directly as a script.
if __name__ == "__main__":
    main()
|
|
233
|
+
|