tab2seq 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tab2seq-0.1.1/LICENSE +21 -0
- tab2seq-0.1.1/PKG-INFO +246 -0
- tab2seq-0.1.1/README.md +196 -0
- tab2seq-0.1.1/pyproject.toml +95 -0
- tab2seq-0.1.1/setup.cfg +4 -0
- tab2seq-0.1.1/src/tab2seq/__init__.py +9 -0
- tab2seq-0.1.1/src/tab2seq/datasets/__init__.py +5 -0
- tab2seq-0.1.1/src/tab2seq/datasets/synthetic.py +602 -0
- tab2seq-0.1.1/src/tab2seq/source/__init__.py +12 -0
- tab2seq-0.1.1/src/tab2seq/source/collection.py +165 -0
- tab2seq-0.1.1/src/tab2seq/source/config.py +101 -0
- tab2seq-0.1.1/src/tab2seq/source/core.py +223 -0
- tab2seq-0.1.1/src/tab2seq.egg-info/PKG-INFO +246 -0
- tab2seq-0.1.1/src/tab2seq.egg-info/SOURCES.txt +17 -0
- tab2seq-0.1.1/src/tab2seq.egg-info/dependency_links.txt +1 -0
- tab2seq-0.1.1/src/tab2seq.egg-info/requires.txt +29 -0
- tab2seq-0.1.1/src/tab2seq.egg-info/top_level.txt +1 -0
- tab2seq-0.1.1/tests/test_datasets.py +352 -0
- tab2seq-0.1.1/tests/test_source.py +434 -0
tab2seq-0.1.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Germans Savcisens
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
tab2seq-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tab2seq
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Transform tabular event data into sequences ready for Transformer and Sequential models: Life2Vec, BEHRT and more.
|
|
5
|
+
Author-email: Germans Savcisens <germans@savcisens.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/carlomarxdk/tab2seq
|
|
8
|
+
Project-URL: Documentation, https://tab2seq.readthedocs.io
|
|
9
|
+
Project-URL: Repository, https://github.com/carlomarxdk/tab2seq
|
|
10
|
+
Project-URL: Issues, https://github.com/carlomarxdk/tab2seq/issues
|
|
11
|
+
Keywords: tokenization,data preprocessing,tabular data,transformer models,sequential models,life2vec
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: numpy>=2.0.0
|
|
24
|
+
Requires-Dist: polars<2.0,>=1.38.0
|
|
25
|
+
Requires-Dist: pyarrow>=12.0.0
|
|
26
|
+
Requires-Dist: pydantic>=2.0.0
|
|
27
|
+
Requires-Dist: tqdm>=4.65.0
|
|
28
|
+
Requires-Dist: pyyaml>=6.0
|
|
29
|
+
Requires-Dist: click>=8.1.0
|
|
30
|
+
Requires-Dist: joblib>=1.3.0
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=9.0.0; extra == "dev"
|
|
33
|
+
Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-xdist>=3.5.0; extra == "dev"
|
|
35
|
+
Requires-Dist: ruff>=0.15.0; extra == "dev"
|
|
36
|
+
Requires-Dist: mypy>=1.19.0; extra == "dev"
|
|
37
|
+
Requires-Dist: types-PyYAML>=6.0.0; extra == "dev"
|
|
38
|
+
Provides-Extra: docs
|
|
39
|
+
Requires-Dist: mkdocs>=1.6.1; extra == "docs"
|
|
40
|
+
Requires-Dist: mkdocs-material>=9.7.1; extra == "docs"
|
|
41
|
+
Requires-Dist: mkdocstrings>=1.0.2; extra == "docs"
|
|
42
|
+
Requires-Dist: mkdocstrings-python>=2.0.0; extra == "docs"
|
|
43
|
+
Requires-Dist: mkdocs-gen-files>=0.6.0; extra == "docs"
|
|
44
|
+
Requires-Dist: mkdocs-literate-nav>=0.6.2; extra == "docs"
|
|
45
|
+
Requires-Dist: mkdocs-section-index>=0.3.10; extra == "docs"
|
|
46
|
+
Requires-Dist: mkdocs-bibtex>=4.4.0; extra == "docs"
|
|
47
|
+
Provides-Extra: all
|
|
48
|
+
Requires-Dist: tab2seq[dev,docs]; extra == "all"
|
|
49
|
+
Dynamic: license-file
|
|
50
|
+
|
|
51
|
+
# tab2seq
|
|
52
|
+
|
|
53
|
+
[](https://pypi.org/project/tab2seq/)
|
|
54
|
+
[](https://pypi.org/project/tab2seq/)
|
|
55
|
+
[](https://pypi.org/project/tab2seq/)
|
|
56
|
+
[](https://github.com/carlomarxdk/tab2seq/blob/main/LICENSE)
|
|
57
|
+
|
|
58
|
+
**tab2seq** adapts the Life2Vec data processing pipeline to make it easy to work with multi-source tabular event data for sequential modeling projects. Transform registry data, EHR records, and other event-based datasets into formats ready for Transformer and sequential deep learning models.
|
|
59
|
+
|
|
60
|
+
> [!WARNING]
|
|
61
|
+
> This is an alpha package. In the beta version, it will reimplement all the data-preprocessing steps of the [life2vec](https://github.com/SocialComplexityLab/life2vec) and [life2vec-light](https://github.com/carlomarxdk/life2vec-light) repos. See [TODOs](#todos) to see what is implemented at this point.
|
|
62
|
+
|
|
63
|
+
## About
|
|
64
|
+
|
|
65
|
+
This package extracts and generalizes the data processing patterns from the [Life2Vec](https://github.com/SocialComplexityLab/life2vec) project, making them reusable for similar research projects that need to:
|
|
66
|
+
- Work with multiple longitudinal data sources (registries, databases)
|
|
67
|
+
- Define and filter cohorts based on complex criteria
|
|
68
|
+
- Generate realistic synthetic data for development and testing
|
|
69
|
+
- Process large-scale tabular event data efficiently
|
|
70
|
+
|
|
71
|
+
Whether you're working with healthcare data, financial records, or any time-stamped event data, tab2seq provides the building blocks for preparing data for Life2Vec-style sequential models.
|
|
72
|
+
|
|
73
|
+
## Features
|
|
74
|
+
|
|
75
|
+
- **Multi-Source Data Management**: Handle multiple data sources (registries) with unified schema
|
|
76
|
+
- **Type-Safe Configuration**: Pydantic-based configuration with YAML support
|
|
77
|
+
- **Synthetic Data Generation**: Generate realistic dummy registry data for testing and exploration
|
|
78
|
+
- **Memory-Efficient Loading**: Chunked iteration and lazy loading with Polars
|
|
79
|
+
- **Schema Validation**: Automatic validation of entity IDs, timestamps, and column types
|
|
80
|
+
- **Cross-Source Operations**: Unified access and operations across multiple data sources
|
|
81
|
+
|
|
82
|
+
## Installation
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
# Basic installation
|
|
86
|
+
pip install tab2seq
|
|
87
|
+
|
|
88
|
+
# Development installation
|
|
89
|
+
pip install -e ".[dev]"
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Quick Start
|
|
93
|
+
|
|
94
|
+
### Working with Multiple Data Sources
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
from tab2seq.source import Source, SourceCollection, SourceConfig
|
|
98
|
+
|
|
99
|
+
# Define your data sources
|
|
100
|
+
configs = [
|
|
101
|
+
SourceConfig(
|
|
102
|
+
name="health",
|
|
103
|
+
filepath="data/health.parquet",
|
|
104
|
+
entity_id_col="patient_id",
|
|
105
|
+
timestamp_cols=["date"],
|
|
106
|
+
categorical_cols=["diagnosis", "procedure", "department"],
|
|
107
|
+
continuous_cols=["cost", "length_of_stay"],
|
|
108
|
+
),
|
|
109
|
+
SourceConfig(
|
|
110
|
+
name="income",
|
|
111
|
+
filepath="data/income.parquet",
|
|
112
|
+
entity_id_col="person_id",
|
|
113
|
+
timestamp_cols=["year"],
|
|
114
|
+
categorical_cols=["income_type", "sector"],
|
|
115
|
+
continuous_cols=["income_amount"],
|
|
116
|
+
),
|
|
117
|
+
]
|
|
118
|
+
|
|
119
|
+
# Create a source collection
|
|
120
|
+
collection = SourceCollection.from_configs(configs)
|
|
121
|
+
|
|
122
|
+
# Access individual sources
|
|
123
|
+
health = collection["health"]
|
|
124
|
+
df = health.read_all()
|
|
125
|
+
|
|
126
|
+
# Or iterate over all sources
|
|
127
|
+
for source in collection:
|
|
128
|
+
print(f"{source.name}: {len(source.get_entity_ids())} entities")
|
|
129
|
+
|
|
130
|
+
# Cross-source operations
|
|
131
|
+
all_entity_ids = collection.get_all_entity_ids()
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Generating Synthetic Data
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from tab2seq.datasets import generate_synthetic_collections
|
|
138
|
+
|
|
139
|
+
# Generate synthetic registry data for testing
|
|
140
|
+
collection = generate_synthetic_collections(
|
|
141
|
+
output_dir="data/dummy",
|
|
142
|
+
n_entities=1000,
|
|
143
|
+
seed=42
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
# Returns a ready-to-use SourceCollection
|
|
147
|
+
health = collection["health"]
|
|
148
|
+
print(health.read_all().head())
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## Architecture
|
|
152
|
+
|
|
153
|
+
> [!WARNING]
|
|
154
|
+
> Work in progress!
|
|
155
|
+
|
|
156
|
+
**Available Registries:**
|
|
157
|
+
|
|
158
|
+
- **health**: Medical events with diagnoses (ICD codes), procedures, departments, costs, and length of stay
|
|
159
|
+
- **income**: Yearly income records with income type, sector, and amounts
|
|
160
|
+
- **labour**: Quarterly labour status with occupation, employment status, and residence
|
|
161
|
+
- **survey**: Periodic survey responses with education level, marital status, and satisfaction scores
|
|
162
|
+
|
|
163
|
+
All synthetic data includes realistic temporal patterns, missing data, and correlations between fields to mimic real-world registry data.
|
|
164
|
+
|
|
165
|
+
## Use Cases
|
|
166
|
+
|
|
167
|
+
- **Healthcare Research**: Transform electronic health records (EHR) into sequences for predictive modeling
|
|
168
|
+
- **Registry Data Processing**: Work with multiple event-based registries (health, income, labour, surveys)
|
|
169
|
+
- **Sequential Modeling**: Prepare multi-source data for Life2Vec, BEHRT, or other transformer-based models
|
|
170
|
+
- **Data Pipeline Development**: Use synthetic data to develop and test processing pipelines before working with sensitive real data
|
|
171
|
+
- **Multi-Source Analysis**: Combine and analyze data from multiple longitudinal sources with unified tooling
|
|
172
|
+
|
|
173
|
+
## Development
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
# Install development dependencies
|
|
177
|
+
pip install -e ".[dev]"
|
|
178
|
+
|
|
179
|
+
# Run tests
|
|
180
|
+
pytest
|
|
181
|
+
|
|
182
|
+
# Run tests with coverage
|
|
183
|
+
pytest --cov=tab2seq --cov-report=html
|
|
184
|
+
|
|
185
|
+
# Format code
|
|
186
|
+
black src/tab2seq tests
|
|
187
|
+
|
|
188
|
+
# Lint code
|
|
189
|
+
ruff check src/tab2seq tests
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## TODOs
|
|
193
|
+
|
|
194
|
+
- [x] Synthetic Datasets
|
|
195
|
+
- [x] `Source` implementation
|
|
196
|
+
- [ ] `Cohort` implementation
|
|
197
|
+
- [ ] `Cohort` and data splits
|
|
198
|
+
- [ ] `Tokenization` implementation
|
|
199
|
+
- [ ] `Vocabulary` implementation
|
|
200
|
+
- [ ] Caching and chunking
|
|
201
|
+
|
|
202
|
+
## Citation
|
|
203
|
+
|
|
204
|
+
If you use this package in your research, please cite:
|
|
205
|
+
|
|
206
|
+
```bibtex
|
|
207
|
+
@software{tab2seq2024,
|
|
208
|
+
author = {Savcisens, Germans},
|
|
209
|
+
title = {tab2seq: Scalable Tabular to Sequential Data Processing},
|
|
210
|
+
year = {2024},
|
|
211
|
+
url = {https://github.com/carlomarxdk/tab2seq}
|
|
212
|
+
}
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
And the original Life2Vec paper that inspired this work:
|
|
216
|
+
|
|
217
|
+
```bibtex
|
|
218
|
+
@article{savcisens2024using,
|
|
219
|
+
title={Using sequences of life-events to predict human lives},
|
|
220
|
+
author={Savcisens, Germans and Eliassi-Rad, Tina and Hansen, Lars Kai and Mortensen, Laust Hvas and Lilleholt, Lau and Rogers, Anna and Zettler, Ingo and Lehmann, Sune},
|
|
221
|
+
journal={Nature computational science},
|
|
222
|
+
volume={4},
|
|
223
|
+
number={1},
|
|
224
|
+
pages={43--56},
|
|
225
|
+
year={2024},
|
|
226
|
+
publisher={Nature Publishing Group US New York}
|
|
227
|
+
}
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
## Acknowledgments
|
|
231
|
+
|
|
232
|
+
- Inspired by the data processing pipeline from [Life2Vec](https://github.com/SocialComplexityLab/life2vec) and [Life2Vec-Light](https://github.com/SocialComplexityLab/life2vec-light)
|
|
233
|
+
- Built with [Polars](https://polars.rs/), [PyArrow](https://arrow.apache.org/docs/python/), [Pydantic](https://pydantic.dev/), and [Joblib](https://joblib.readthedocs.io/)
|
|
234
|
+
|
|
235
|
+
## Contributing
|
|
236
|
+
|
|
237
|
+
Contributions are welcome! Please open an issue or submit a pull request on [GitHub](https://github.com/carlomarxdk/tab2seq).
|
|
238
|
+
|
|
239
|
+
## License
|
|
240
|
+
|
|
241
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
242
|
+
|
|
243
|
+
## Support
|
|
244
|
+
|
|
245
|
+
- 🐛 Issues: [GitHub Issues](https://github.com/carlomarxdk/tab2seq/issues)
|
|
246
|
+
- 💬 Discussions: [GitHub Discussions](https://github.com/carlomarxdk/tab2seq/discussions)
|
tab2seq-0.1.1/README.md
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
# tab2seq
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/tab2seq/)
|
|
4
|
+
[](https://pypi.org/project/tab2seq/)
|
|
5
|
+
[](https://pypi.org/project/tab2seq/)
|
|
6
|
+
[](https://github.com/carlomarxdk/tab2seq/blob/main/LICENSE)
|
|
7
|
+
|
|
8
|
+
**tab2seq** adapts the Life2Vec data processing pipeline to make it easy to work with multi-source tabular event data for sequential modeling projects. Transform registry data, EHR records, and other event-based datasets into formats ready for Transformer and sequential deep learning models.
|
|
9
|
+
|
|
10
|
+
> [!WARNING]
|
|
11
|
+
> This is an alpha package. In the beta version, it will reimplement all the data-preprocessing steps of the [life2vec](https://github.com/SocialComplexityLab/life2vec) and [life2vec-light](https://github.com/carlomarxdk/life2vec-light) repos. See [TODOs](#todos) to see what is implemented at this point.
|
|
12
|
+
|
|
13
|
+
## About
|
|
14
|
+
|
|
15
|
+
This package extracts and generalizes the data processing patterns from the [Life2Vec](https://github.com/SocialComplexityLab/life2vec) project, making them reusable for similar research projects that need to:
|
|
16
|
+
- Work with multiple longitudinal data sources (registries, databases)
|
|
17
|
+
- Define and filter cohorts based on complex criteria
|
|
18
|
+
- Generate realistic synthetic data for development and testing
|
|
19
|
+
- Process large-scale tabular event data efficiently
|
|
20
|
+
|
|
21
|
+
Whether you're working with healthcare data, financial records, or any time-stamped event data, tab2seq provides the building blocks for preparing data for Life2Vec-style sequential models.
|
|
22
|
+
|
|
23
|
+
## Features
|
|
24
|
+
|
|
25
|
+
- **Multi-Source Data Management**: Handle multiple data sources (registries) with unified schema
|
|
26
|
+
- **Type-Safe Configuration**: Pydantic-based configuration with YAML support
|
|
27
|
+
- **Synthetic Data Generation**: Generate realistic dummy registry data for testing and exploration
|
|
28
|
+
- **Memory-Efficient Loading**: Chunked iteration and lazy loading with Polars
|
|
29
|
+
- **Schema Validation**: Automatic validation of entity IDs, timestamps, and column types
|
|
30
|
+
- **Cross-Source Operations**: Unified access and operations across multiple data sources
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
# Basic installation
|
|
36
|
+
pip install tab2seq
|
|
37
|
+
|
|
38
|
+
# Development installation
|
|
39
|
+
pip install -e ".[dev]"
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Quick Start
|
|
43
|
+
|
|
44
|
+
### Working with Multiple Data Sources
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from tab2seq.source import Source, SourceCollection, SourceConfig
|
|
48
|
+
|
|
49
|
+
# Define your data sources
|
|
50
|
+
configs = [
|
|
51
|
+
SourceConfig(
|
|
52
|
+
name="health",
|
|
53
|
+
filepath="data/health.parquet",
|
|
54
|
+
entity_id_col="patient_id",
|
|
55
|
+
timestamp_cols=["date"],
|
|
56
|
+
categorical_cols=["diagnosis", "procedure", "department"],
|
|
57
|
+
continuous_cols=["cost", "length_of_stay"],
|
|
58
|
+
),
|
|
59
|
+
SourceConfig(
|
|
60
|
+
name="income",
|
|
61
|
+
filepath="data/income.parquet",
|
|
62
|
+
entity_id_col="person_id",
|
|
63
|
+
timestamp_cols=["year"],
|
|
64
|
+
categorical_cols=["income_type", "sector"],
|
|
65
|
+
continuous_cols=["income_amount"],
|
|
66
|
+
),
|
|
67
|
+
]
|
|
68
|
+
|
|
69
|
+
# Create a source collection
|
|
70
|
+
collection = SourceCollection.from_configs(configs)
|
|
71
|
+
|
|
72
|
+
# Access individual sources
|
|
73
|
+
health = collection["health"]
|
|
74
|
+
df = health.read_all()
|
|
75
|
+
|
|
76
|
+
# Or iterate over all sources
|
|
77
|
+
for source in collection:
|
|
78
|
+
print(f"{source.name}: {len(source.get_entity_ids())} entities")
|
|
79
|
+
|
|
80
|
+
# Cross-source operations
|
|
81
|
+
all_entity_ids = collection.get_all_entity_ids()
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Generating Synthetic Data
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
from tab2seq.datasets import generate_synthetic_collections
|
|
88
|
+
|
|
89
|
+
# Generate synthetic registry data for testing
|
|
90
|
+
collection = generate_synthetic_collections(
|
|
91
|
+
output_dir="data/dummy",
|
|
92
|
+
n_entities=1000,
|
|
93
|
+
seed=42
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
# Returns a ready-to-use SourceCollection
|
|
97
|
+
health = collection["health"]
|
|
98
|
+
print(health.read_all().head())
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Architecture
|
|
102
|
+
|
|
103
|
+
> [!WARNING]
|
|
104
|
+
> Work in progress!
|
|
105
|
+
|
|
106
|
+
**Available Registries:**
|
|
107
|
+
|
|
108
|
+
- **health**: Medical events with diagnoses (ICD codes), procedures, departments, costs, and length of stay
|
|
109
|
+
- **income**: Yearly income records with income type, sector, and amounts
|
|
110
|
+
- **labour**: Quarterly labour status with occupation, employment status, and residence
|
|
111
|
+
- **survey**: Periodic survey responses with education level, marital status, and satisfaction scores
|
|
112
|
+
|
|
113
|
+
All synthetic data includes realistic temporal patterns, missing data, and correlations between fields to mimic real-world registry data.
|
|
114
|
+
|
|
115
|
+
## Use Cases
|
|
116
|
+
|
|
117
|
+
- **Healthcare Research**: Transform electronic health records (EHR) into sequences for predictive modeling
|
|
118
|
+
- **Registry Data Processing**: Work with multiple event-based registries (health, income, labour, surveys)
|
|
119
|
+
- **Sequential Modeling**: Prepare multi-source data for Life2Vec, BEHRT, or other transformer-based models
|
|
120
|
+
- **Data Pipeline Development**: Use synthetic data to develop and test processing pipelines before working with sensitive real data
|
|
121
|
+
- **Multi-Source Analysis**: Combine and analyze data from multiple longitudinal sources with unified tooling
|
|
122
|
+
|
|
123
|
+
## Development
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
# Install development dependencies
|
|
127
|
+
pip install -e ".[dev]"
|
|
128
|
+
|
|
129
|
+
# Run tests
|
|
130
|
+
pytest
|
|
131
|
+
|
|
132
|
+
# Run tests with coverage
|
|
133
|
+
pytest --cov=tab2seq --cov-report=html
|
|
134
|
+
|
|
135
|
+
# Format code
|
|
136
|
+
black src/tab2seq tests
|
|
137
|
+
|
|
138
|
+
# Lint code
|
|
139
|
+
ruff check src/tab2seq tests
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## TODOs
|
|
143
|
+
|
|
144
|
+
- [x] Synthetic Datasets
|
|
145
|
+
- [x] `Source` implementation
|
|
146
|
+
- [ ] `Cohort` implementation
|
|
147
|
+
- [ ] `Cohort` and data splits
|
|
148
|
+
- [ ] `Tokenization` implementation
|
|
149
|
+
- [ ] `Vocabulary` implementation
|
|
150
|
+
- [ ] Caching and chunking
|
|
151
|
+
|
|
152
|
+
## Citation
|
|
153
|
+
|
|
154
|
+
If you use this package in your research, please cite:
|
|
155
|
+
|
|
156
|
+
```bibtex
|
|
157
|
+
@software{tab2seq2024,
|
|
158
|
+
author = {Savcisens, Germans},
|
|
159
|
+
title = {tab2seq: Scalable Tabular to Sequential Data Processing},
|
|
160
|
+
year = {2024},
|
|
161
|
+
url = {https://github.com/carlomarxdk/tab2seq}
|
|
162
|
+
}
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
And the original Life2Vec paper that inspired this work:
|
|
166
|
+
|
|
167
|
+
```bibtex
|
|
168
|
+
@article{savcisens2024using,
|
|
169
|
+
title={Using sequences of life-events to predict human lives},
|
|
170
|
+
author={Savcisens, Germans and Eliassi-Rad, Tina and Hansen, Lars Kai and Mortensen, Laust Hvas and Lilleholt, Lau and Rogers, Anna and Zettler, Ingo and Lehmann, Sune},
|
|
171
|
+
journal={Nature computational science},
|
|
172
|
+
volume={4},
|
|
173
|
+
number={1},
|
|
174
|
+
pages={43--56},
|
|
175
|
+
year={2024},
|
|
176
|
+
publisher={Nature Publishing Group US New York}
|
|
177
|
+
}
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Acknowledgments
|
|
181
|
+
|
|
182
|
+
- Inspired by the data processing pipeline from [Life2Vec](https://github.com/SocialComplexityLab/life2vec) and [Life2Vec-Light](https://github.com/SocialComplexityLab/life2vec-light)
|
|
183
|
+
- Built with [Polars](https://polars.rs/), [PyArrow](https://arrow.apache.org/docs/python/), [Pydantic](https://pydantic.dev/), and [Joblib](https://joblib.readthedocs.io/)
|
|
184
|
+
|
|
185
|
+
## Contributing
|
|
186
|
+
|
|
187
|
+
Contributions are welcome! Please open an issue or submit a pull request on [GitHub](https://github.com/carlomarxdk/tab2seq).
|
|
188
|
+
|
|
189
|
+
## License
|
|
190
|
+
|
|
191
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
192
|
+
|
|
193
|
+
## Support
|
|
194
|
+
|
|
195
|
+
- 🐛 Issues: [GitHub Issues](https://github.com/carlomarxdk/tab2seq/issues)
|
|
196
|
+
- 💬 Discussions: [GitHub Discussions](https://github.com/carlomarxdk/tab2seq/discussions)
|
|
tab2seq-0.1.1/pyproject.toml
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=65.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tab2seq"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
description = "Transform tabular event data into sequences ready for Transformer and Sequential models: Life2Vec, BEHRT and more."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Germans Savcisens", email = "germans@savcisens.com"}
|
|
14
|
+
]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 3 - Alpha",
|
|
17
|
+
"Intended Audience :: Science/Research",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
23
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
keywords = ["tokenization", "data preprocessing", "tabular data", "transformer models", "sequential models", "life2vec"]
|
|
27
|
+
|
|
28
|
+
dependencies = [
|
|
29
|
+
"numpy>=2.0.0",
|
|
30
|
+
"polars>=1.38.0,<2.0",
|
|
31
|
+
"pyarrow>=12.0.0",
|
|
32
|
+
"pydantic>=2.0.0",
|
|
33
|
+
"tqdm>=4.65.0",
|
|
34
|
+
"pyyaml>=6.0",
|
|
35
|
+
"click>=8.1.0",
|
|
36
|
+
"joblib>=1.3.0",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[project.optional-dependencies]
|
|
40
|
+
dev = [
|
|
41
|
+
"pytest>=9.0.0",
|
|
42
|
+
"pytest-cov>=6.0.0",
|
|
43
|
+
"pytest-xdist>=3.5.0",
|
|
44
|
+
"ruff>=0.15.0",
|
|
45
|
+
"mypy>=1.19.0",
|
|
46
|
+
"types-PyYAML>=6.0.0",
|
|
47
|
+
]
|
|
48
|
+
docs = [
|
|
49
|
+
"mkdocs>=1.6.1",
|
|
50
|
+
"mkdocs-material>=9.7.1",
|
|
51
|
+
"mkdocstrings>=1.0.2",
|
|
52
|
+
"mkdocstrings-python>=2.0.0",
|
|
53
|
+
"mkdocs-gen-files>=0.6.0",
|
|
54
|
+
"mkdocs-literate-nav>=0.6.2",
|
|
55
|
+
"mkdocs-section-index>=0.3.10",
|
|
56
|
+
"mkdocs-bibtex>=4.4.0",
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
all = ["tab2seq[dev,docs]"]
|
|
60
|
+
|
|
61
|
+
[project.urls]
|
|
62
|
+
Homepage = "https://github.com/carlomarxdk/tab2seq"
|
|
63
|
+
Documentation = "https://tab2seq.readthedocs.io"
|
|
64
|
+
Repository = "https://github.com/carlomarxdk/tab2seq"
|
|
65
|
+
Issues = "https://github.com/carlomarxdk/tab2seq/issues"
|
|
66
|
+
|
|
67
|
+
[tool.setuptools.packages.find]
|
|
68
|
+
where = ["src"]
|
|
69
|
+
include = ["tab2seq*"]
|
|
70
|
+
|
|
71
|
+
[tool.pytest.ini_options]
|
|
72
|
+
testpaths = ["tests"]
|
|
73
|
+
python_files = ["test_*.py"]
|
|
74
|
+
python_classes = ["Test*"]
|
|
75
|
+
python_functions = ["test_*"]
|
|
76
|
+
addopts = [
|
|
77
|
+
"--strict-markers",
|
|
78
|
+
"--cov=tab2seq",
|
|
79
|
+
"--cov-report=term-missing",
|
|
80
|
+
"--cov-report=html",
|
|
81
|
+
]
|
|
82
|
+
|
|
83
|
+
[tool.ruff]
|
|
84
|
+
line-length = 88
|
|
85
|
+
target-version = "py311"
|
|
86
|
+
|
|
87
|
+
[tool.ruff.lint]
|
|
88
|
+
select = ["E", "F", "I", "N", "W", "D"]
|
|
89
|
+
ignore = ["D203", "D213"]
|
|
90
|
+
|
|
91
|
+
[tool.mypy]
|
|
92
|
+
python_version = "3.11"
|
|
93
|
+
warn_return_any = true
|
|
94
|
+
warn_unused_configs = true
|
|
95
|
+
disallow_untyped_defs = true
|
tab2seq-0.1.1/src/tab2seq/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""tab2seq - Transform tabular event data into sequences for transformer models."""

from importlib.metadata import PackageNotFoundError, version

try:
    # Read the version from the installed distribution's metadata so the
    # version string lives in exactly one place (the build configuration).
    __version__ = version("tab2seq")
except PackageNotFoundError:
    # The distribution metadata is absent when running straight from a
    # source checkout without an (editable) install; fall back gracefully.
    __version__ = "unknown"
|