itertoolkit 1.4.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- itertoolkit-1.4.9/.gitignore +225 -0
- itertoolkit-1.4.9/PKG-INFO +120 -0
- itertoolkit-1.4.9/README.md +104 -0
- itertoolkit-1.4.9/__init__.py +0 -0
- itertoolkit-1.4.9/bm_preprocessing/__init__.py +14 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/DM/__init__.py +7 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/DM/agg.py +6 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/DM/dbscan.py +6 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/DM/finals.py +6 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/DM/gsp.py +6 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/DM/test.py +6 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/Finals/__init__.py +7 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/Finals/kaadhal.py +6 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/Finals/raaka.py +6 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/Finals/seedan.py +6 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/Finals/vikram.py +6 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/IR/__init__.py +6 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/IR/finals.py +6 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/IR/pagerank.py +6 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/IR/recommenders_pca.py +8 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/IR/test.py +6 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/PY/__init__.py +4 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/PY/lib_doc.py +6 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/PY/python_doc.py +6 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/__init__.py +8 -0
- itertoolkit-1.4.9/bm_preprocessing/importer/_module_printer.py +23 -0
- itertoolkit-1.4.9/bm_preprocessing/src/DM/__init__.py +1 -0
- itertoolkit-1.4.9/bm_preprocessing/src/DM/agg.py +267 -0
- itertoolkit-1.4.9/bm_preprocessing/src/DM/dbscan.py +218 -0
- itertoolkit-1.4.9/bm_preprocessing/src/DM/finals.py +19 -0
- itertoolkit-1.4.9/bm_preprocessing/src/DM/gsp.py +378 -0
- itertoolkit-1.4.9/bm_preprocessing/src/DM/test.py +19 -0
- itertoolkit-1.4.9/bm_preprocessing/src/Finals/__init__.py +1 -0
- itertoolkit-1.4.9/bm_preprocessing/src/Finals/kaadhal.py +1453 -0
- itertoolkit-1.4.9/bm_preprocessing/src/Finals/raaka.py +1338 -0
- itertoolkit-1.4.9/bm_preprocessing/src/Finals/seedan.py +1173 -0
- itertoolkit-1.4.9/bm_preprocessing/src/Finals/vikram.py +520 -0
- itertoolkit-1.4.9/bm_preprocessing/src/IR/__init__.py +1 -0
- itertoolkit-1.4.9/bm_preprocessing/src/IR/finals.py +14 -0
- itertoolkit-1.4.9/bm_preprocessing/src/IR/pagerank.py +109 -0
- itertoolkit-1.4.9/bm_preprocessing/src/IR/recommenders_pca.py +487 -0
- itertoolkit-1.4.9/bm_preprocessing/src/IR/test.py +14 -0
- itertoolkit-1.4.9/bm_preprocessing/src/PY/__init__.py +1 -0
- itertoolkit-1.4.9/bm_preprocessing/src/PY/lib_doc.py +295 -0
- itertoolkit-1.4.9/bm_preprocessing/src/PY/python_doc.py +177 -0
- itertoolkit-1.4.9/bm_preprocessing/src/__init__.py +1 -0
- itertoolkit-1.4.9/main.py +6 -0
- itertoolkit-1.4.9/pyproject.toml +25 -0
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
# poetry.lock
|
|
109
|
+
# poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
# pdm.lock
|
|
116
|
+
# pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
# pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# Redis
|
|
135
|
+
*.rdb
|
|
136
|
+
*.aof
|
|
137
|
+
*.pid
|
|
138
|
+
|
|
139
|
+
# RabbitMQ
|
|
140
|
+
mnesia/
|
|
141
|
+
rabbitmq/
|
|
142
|
+
rabbitmq-data/
|
|
143
|
+
|
|
144
|
+
# ActiveMQ
|
|
145
|
+
activemq-data/
|
|
146
|
+
|
|
147
|
+
# SageMath parsed files
|
|
148
|
+
*.sage.py
|
|
149
|
+
|
|
150
|
+
# Environments
|
|
151
|
+
.env
|
|
152
|
+
.envrc
|
|
153
|
+
.venv
|
|
154
|
+
env/
|
|
155
|
+
venv/
|
|
156
|
+
ENV/
|
|
157
|
+
env.bak/
|
|
158
|
+
venv.bak/
|
|
159
|
+
|
|
160
|
+
# Spyder project settings
|
|
161
|
+
.spyderproject
|
|
162
|
+
.spyproject
|
|
163
|
+
|
|
164
|
+
# Rope project settings
|
|
165
|
+
.ropeproject
|
|
166
|
+
|
|
167
|
+
# mkdocs documentation
|
|
168
|
+
/site
|
|
169
|
+
|
|
170
|
+
# mypy
|
|
171
|
+
.mypy_cache/
|
|
172
|
+
.dmypy.json
|
|
173
|
+
dmypy.json
|
|
174
|
+
|
|
175
|
+
# Pyre type checker
|
|
176
|
+
.pyre/
|
|
177
|
+
|
|
178
|
+
# pytype static type analyzer
|
|
179
|
+
.pytype/
|
|
180
|
+
|
|
181
|
+
# Cython debug symbols
|
|
182
|
+
cython_debug/
|
|
183
|
+
|
|
184
|
+
# PyCharm
|
|
185
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
186
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
188
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
189
|
+
.idea/
|
|
190
|
+
|
|
191
|
+
# Abstra
|
|
192
|
+
# Abstra is an AI-powered process automation framework.
|
|
193
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
194
|
+
# Learn more at https://abstra.io/docs
|
|
195
|
+
.abstra/
|
|
196
|
+
|
|
197
|
+
# Visual Studio Code
|
|
198
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
199
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
200
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
201
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
202
|
+
.vscode/
|
|
203
|
+
|
|
204
|
+
# Ruff stuff:
|
|
205
|
+
.ruff_cache/
|
|
206
|
+
|
|
207
|
+
# PyPI configuration file
|
|
208
|
+
.pypirc
|
|
209
|
+
|
|
210
|
+
# Marimo
|
|
211
|
+
marimo/_static/
|
|
212
|
+
marimo/_lsp/
|
|
213
|
+
__marimo__/
|
|
214
|
+
|
|
215
|
+
# Streamlit
|
|
216
|
+
.streamlit/secrets.toml
|
|
217
|
+
|
|
218
|
+
*.png
|
|
219
|
+
*.jpg
|
|
220
|
+
*.pdf
|
|
221
|
+
*.jpeg
|
|
222
|
+
|
|
223
|
+
CMDS.md
|
|
224
|
+
|
|
225
|
+
*.code-workspace
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: itertoolkit
|
|
3
|
+
Version: 1.4.9
|
|
4
|
+
Summary: An itertools-inspired toolkit for cached iterator and data-structure processing
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: gsppy>=5.3.0
|
|
7
|
+
Requires-Dist: matplotlib>=3.10.8
|
|
8
|
+
Requires-Dist: networkx>=3.6.1
|
|
9
|
+
Requires-Dist: numpy>=2.4.4
|
|
10
|
+
Requires-Dist: pandas>=3.0.2
|
|
11
|
+
Requires-Dist: plotly>=6.6.0
|
|
12
|
+
Requires-Dist: scikit-learn>=1.8.0
|
|
13
|
+
Requires-Dist: scipy>=1.17.1
|
|
14
|
+
Requires-Dist: seaborn>=0.13.2
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
# itertoolkit
|
|
18
|
+
|
|
19
|
+
Functions creating iterators and cached data pipelines for efficient looping.
|
|
20
|
+
|
|
21
|
+
`itertoolkit` is an `itertools`-inspired wrapper focused on practical data processing. It keeps the lazy, composable style of iterator algebra, then adds cache-aware helpers so repeated list and data-structure transformations run faster.
|
|
22
|
+
|
|
23
|
+
The goal is simple:
|
|
24
|
+
|
|
25
|
+
- Keep memory usage low with lazy iterators.
|
|
26
|
+
- Speed up repeated workloads with caching.
|
|
27
|
+
- Make iterator pipelines readable and reusable.
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install itertoolkit
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Quick Start
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from itertools import count, islice
|
|
39
|
+
|
|
40
|
+
# Install name: itertoolkit
|
|
41
|
+
# Current import path in this repo remains bm_preprocessing
|
|
42
|
+
from bm_preprocessing import IR, DM
|
|
43
|
+
|
|
44
|
+
# Example: base itertools stream
|
|
45
|
+
stream = (x * x for x in count(1))
|
|
46
|
+
print(list(islice(stream, 5))) # [1, 4, 9, 16, 25]
|
|
47
|
+
|
|
48
|
+
# Example: cached computation workflow (concept)
|
|
49
|
+
# result = itertoolkit.cached_map(expensive_fn, dataset, cache_key="v1")
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Why It Is Faster
|
|
53
|
+
|
|
54
|
+
`itertoolkit` performance comes from combining:
|
|
55
|
+
|
|
56
|
+
- Lazy iteration, so intermediate materialization is avoided.
|
|
57
|
+
- Cache-first wrappers, so repeated transformations are reused.
|
|
58
|
+
- Composable pipelines, so complex loops stay compact and optimized.
|
|
59
|
+
|
|
60
|
+
In repeated analytics or feature-building jobs, the first pass computes and stores results, and later passes can fetch from cache instead of recomputing every step.
|
|
61
|
+
|
|
62
|
+
## Core Iterator Families
|
|
63
|
+
|
|
64
|
+
### General iterators
|
|
65
|
+
|
|
66
|
+
| Iterator concept | Input | Output shape | Typical use |
|
|
67
|
+
| --- | --- | --- | --- |
|
|
68
|
+
| Running reduction | iterable, func | incremental totals | rolling stats |
|
|
69
|
+
| Batching | iterable, n | tuples of size n | chunk processing |
|
|
70
|
+
| Chaining | multiple iterables | one continuous stream | merging sources |
|
|
71
|
+
| Selection | data + selectors | filtered stream | mask-based filtering |
|
|
72
|
+
| Windowing | iterable | adjacent pairs/windows | transition analysis |
|
|
73
|
+
| Truncation | predicate/slice | bounded output | safe handling of infinite streams |
|
|
74
|
+
|
|
75
|
+
### Combinatoric iterators
|
|
76
|
+
|
|
77
|
+
| Iterator concept | Output |
|
|
78
|
+
| --- | --- |
|
|
79
|
+
| Cartesian products | all pairings across inputs |
|
|
80
|
+
| Permutations | order-sensitive tuples |
|
|
81
|
+
| Combinations | order-insensitive unique tuples |
|
|
82
|
+
| Combinations with replacement | tuples allowing repeated values |
|
|
83
|
+
|
|
84
|
+
## Pipeline Pattern
|
|
85
|
+
|
|
86
|
+
Use this pattern when processing large lists, tables, graphs, or text records:
|
|
87
|
+
|
|
88
|
+
1. Start from one or more iterables.
|
|
89
|
+
2. Chain filtering, mapping, grouping, and batching.
|
|
90
|
+
3. Add cache boundaries around expensive stages.
|
|
91
|
+
4. Materialize only where needed (`list`, `tuple`, `DataFrame`, model input).
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from itertools import chain
|
|
95
|
+
|
|
96
|
+
sources = [[1, 2, 3], [4, 5], [6]]
|
|
97
|
+
pipeline = (x * 10 for x in chain.from_iterable(sources) if x % 2 == 0)
|
|
98
|
+
print(list(pipeline)) # [20, 40, 60]
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Caching Strategy
|
|
102
|
+
|
|
103
|
+
Recommended caching behavior for data-heavy workloads:
|
|
104
|
+
|
|
105
|
+
- Key by transformation signature and input fingerprint.
|
|
106
|
+
- Keep deterministic steps cacheable.
|
|
107
|
+
- Invalidate cache on function/version changes.
|
|
108
|
+
- Persist long-running results between sessions.
|
|
109
|
+
|
|
110
|
+
This makes repeated preprocessing and feature extraction significantly cheaper.
|
|
111
|
+
|
|
112
|
+
## Compatibility Note
|
|
113
|
+
|
|
114
|
+
Package distribution name is `itertoolkit`.
|
|
115
|
+
|
|
116
|
+
Current code in this repository still exposes the import path `bm_preprocessing` for compatibility with existing users. If needed, a follow-up release can add a top-level `itertoolkit` import alias as well.
|
|
117
|
+
|
|
118
|
+
## License
|
|
119
|
+
|
|
120
|
+
MIT
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# itertoolkit
|
|
2
|
+
|
|
3
|
+
Functions creating iterators and cached data pipelines for efficient looping.
|
|
4
|
+
|
|
5
|
+
`itertoolkit` is an `itertools`-inspired wrapper focused on practical data processing. It keeps the lazy, composable style of iterator algebra, then adds cache-aware helpers so repeated list and data-structure transformations run faster.
|
|
6
|
+
|
|
7
|
+
The goal is simple:
|
|
8
|
+
|
|
9
|
+
- Keep memory usage low with lazy iterators.
|
|
10
|
+
- Speed up repeated workloads with caching.
|
|
11
|
+
- Make iterator pipelines readable and reusable.
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install itertoolkit
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Quick Start
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
from itertools import count, islice
|
|
23
|
+
|
|
24
|
+
# Install name: itertoolkit
|
|
25
|
+
# Current import path in this repo remains bm_preprocessing
|
|
26
|
+
from bm_preprocessing import IR, DM
|
|
27
|
+
|
|
28
|
+
# Example: base itertools stream
|
|
29
|
+
stream = (x * x for x in count(1))
|
|
30
|
+
print(list(islice(stream, 5))) # [1, 4, 9, 16, 25]
|
|
31
|
+
|
|
32
|
+
# Example: cached computation workflow (concept)
|
|
33
|
+
# result = itertoolkit.cached_map(expensive_fn, dataset, cache_key="v1")
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Why It Is Faster
|
|
37
|
+
|
|
38
|
+
`itertoolkit` performance comes from combining:
|
|
39
|
+
|
|
40
|
+
- Lazy iteration, so intermediate materialization is avoided.
|
|
41
|
+
- Cache-first wrappers, so repeated transformations are reused.
|
|
42
|
+
- Composable pipelines, so complex loops stay compact and optimized.
|
|
43
|
+
|
|
44
|
+
In repeated analytics or feature-building jobs, the first pass computes and stores results, and later passes can fetch from cache instead of recomputing every step.
|
|
45
|
+
|
|
46
|
+
## Core Iterator Families
|
|
47
|
+
|
|
48
|
+
### General iterators
|
|
49
|
+
|
|
50
|
+
| Iterator concept | Input | Output shape | Typical use |
|
|
51
|
+
| --- | --- | --- | --- |
|
|
52
|
+
| Running reduction | iterable, func | incremental totals | rolling stats |
|
|
53
|
+
| Batching | iterable, n | tuples of size n | chunk processing |
|
|
54
|
+
| Chaining | multiple iterables | one continuous stream | merging sources |
|
|
55
|
+
| Selection | data + selectors | filtered stream | mask-based filtering |
|
|
56
|
+
| Windowing | iterable | adjacent pairs/windows | transition analysis |
|
|
57
|
+
| Truncation | predicate/slice | bounded output | safe handling of infinite streams |
|
|
58
|
+
|
|
59
|
+
### Combinatoric iterators
|
|
60
|
+
|
|
61
|
+
| Iterator concept | Output |
|
|
62
|
+
| --- | --- |
|
|
63
|
+
| Cartesian products | all pairings across inputs |
|
|
64
|
+
| Permutations | order-sensitive tuples |
|
|
65
|
+
| Combinations | order-insensitive unique tuples |
|
|
66
|
+
| Combinations with replacement | tuples allowing repeated values |
|
|
67
|
+
|
|
68
|
+
## Pipeline Pattern
|
|
69
|
+
|
|
70
|
+
Use this pattern when processing large lists, tables, graphs, or text records:
|
|
71
|
+
|
|
72
|
+
1. Start from one or more iterables.
|
|
73
|
+
2. Chain filtering, mapping, grouping, and batching.
|
|
74
|
+
3. Add cache boundaries around expensive stages.
|
|
75
|
+
4. Materialize only where needed (`list`, `tuple`, `DataFrame`, model input).
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from itertools import chain
|
|
79
|
+
|
|
80
|
+
sources = [[1, 2, 3], [4, 5], [6]]
|
|
81
|
+
pipeline = (x * 10 for x in chain.from_iterable(sources) if x % 2 == 0)
|
|
82
|
+
print(list(pipeline)) # [20, 40, 60]
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Caching Strategy
|
|
86
|
+
|
|
87
|
+
Recommended caching behavior for data-heavy workloads:
|
|
88
|
+
|
|
89
|
+
- Key by transformation signature and input fingerprint.
|
|
90
|
+
- Keep deterministic steps cacheable.
|
|
91
|
+
- Invalidate cache on function/version changes.
|
|
92
|
+
- Persist long-running results between sessions.
|
|
93
|
+
|
|
94
|
+
This makes repeated preprocessing and feature extraction significantly cheaper.
|
|
95
|
+
|
|
96
|
+
## Compatibility Note
|
|
97
|
+
|
|
98
|
+
Package distribution name is `itertoolkit`.
|
|
99
|
+
|
|
100
|
+
Current code in this repository still exposes the import path `bm_preprocessing` for compatibility with existing users. If needed, a follow-up release can add a top-level `itertoolkit` import alias as well.
|
|
101
|
+
|
|
102
|
+
## License
|
|
103
|
+
|
|
104
|
+
MIT
|
|
File without changes
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""bm_preprocessing package."""
|
|
2
|
+
|
|
3
|
+
# Keep legacy module paths working, e.g. ``import bm_preprocessing.DM``.
import sys

from .importer import DM, IR, PY, Finals

# Registering each namespace under its dotted name lets
# ``import bm_preprocessing.<name>`` resolve straight from ``sys.modules``,
# without a same-named submodule living directly under this package.
sys.modules[__name__ + ".DM"] = DM
sys.modules[__name__ + ".PY"] = PY
sys.modules[__name__ + ".IR"] = IR
sys.modules[__name__ + ".Finals"] = Finals


# Public API: every namespace registered above, so that
# ``from bm_preprocessing import *`` agrees with the dotted-import aliases.
__all__ = ["DM", "PY", "IR", "Finals"]
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class SourceCodeModule:
|
|
5
|
+
"""A class that displays source code when printed."""
|
|
6
|
+
|
|
7
|
+
def __init__(self, name: str, source_path: Path):
|
|
8
|
+
self.name = name
|
|
9
|
+
self._source_path = source_path
|
|
10
|
+
self._source_code = None
|
|
11
|
+
|
|
12
|
+
@property
|
|
13
|
+
def source_code(self) -> str:
|
|
14
|
+
"""Lazily load source code."""
|
|
15
|
+
if self._source_code is None:
|
|
16
|
+
self._source_code = self._source_path.read_text(encoding="utf-8")
|
|
17
|
+
return self._source_code
|
|
18
|
+
|
|
19
|
+
def __repr__(self) -> str:
|
|
20
|
+
return self.source_code
|
|
21
|
+
|
|
22
|
+
def __str__(self) -> str:
|
|
23
|
+
return self.source_code
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Data-mining source snippets."""
|