calt-x 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- calt_x-0.1.0/PKG-INFO +168 -0
- calt_x-0.1.0/README.md +151 -0
- calt_x-0.1.0/pyproject.toml +47 -0
- calt_x-0.1.0/setup.cfg +4 -0
- calt_x-0.1.0/src/calt/__init__.py +9 -0
- calt_x-0.1.0/src/calt/data_loader/__init__.py +0 -0
- calt_x-0.1.0/src/calt/data_loader/data_loader.py +86 -0
- calt_x-0.1.0/src/calt/data_loader/utils/__init__.py +0 -0
- calt_x-0.1.0/src/calt/data_loader/utils/data_collator.py +129 -0
- calt_x-0.1.0/src/calt/data_loader/utils/preprocessor.py +313 -0
- calt_x-0.1.0/src/calt/data_loader/utils/tokenizer.py +67 -0
- calt_x-0.1.0/src/calt/generate/__init__.py +0 -0
- calt_x-0.1.0/src/calt/generate/dataset_generator.py +114 -0
- calt_x-0.1.0/src/calt/generate/utils/__init__.py +0 -0
- calt_x-0.1.0/src/calt/generate/utils/dataset_writer.py +136 -0
- calt_x-0.1.0/src/calt/generate/utils/polynomial_sampler.py +210 -0
- calt_x-0.1.0/src/calt/generate/utils/statistics_calculator.py +207 -0
- calt_x-0.1.0/src/calt/trainer/__init__.py +0 -0
- calt_x-0.1.0/src/calt/trainer/trainer.py +157 -0
- calt_x-0.1.0/src/calt/trainer/utils.py +74 -0
- calt_x-0.1.0/src/calt_x.egg-info/PKG-INFO +168 -0
- calt_x-0.1.0/src/calt_x.egg-info/SOURCES.txt +23 -0
- calt_x-0.1.0/src/calt_x.egg-info/dependency_links.txt +1 -0
- calt_x-0.1.0/src/calt_x.egg-info/requires.txt +7 -0
- calt_x-0.1.0/src/calt_x.egg-info/top_level.txt +1 -0
calt_x-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: calt-x
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A library for computational algebra using Transformers
|
|
5
|
+
Author-email: Yuta Sato <sato.yuta@gmail.com>
|
|
6
|
+
Project-URL: Source, https://github.com/HiroshiKERA/calt
|
|
7
|
+
Project-URL: Issues, https://github.com/HiroshiKERA/calt/issues
|
|
8
|
+
Requires-Python: >=3.12
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: transformers>=4.49.0
|
|
11
|
+
Requires-Dist: omegaconf>=2.3.0
|
|
12
|
+
Requires-Dist: torch>=2.6.0
|
|
13
|
+
Requires-Dist: wandb>=0.15.11
|
|
14
|
+
Requires-Dist: accelerate>=0.29.0
|
|
15
|
+
Requires-Dist: joblib>=1.5.0
|
|
16
|
+
Requires-Dist: sympy>=1.12
|
|
17
|
+
|
|
18
|
+
# CALT: Computer ALgebra with Transformer
|
|
19
|
+
This project is currently in its initial development phase. The file structure and content are subject to significant changes. Please ensure you are referring to the latest version when using it.
|
|
20
|
+
|
|
21
|
+
# Environment Setup using Docker
|
|
22
|
+
|
|
23
|
+
This guide explains how to set up the development environment using Docker.
|
|
24
|
+
|
|
25
|
+
## Prerequisites
|
|
26
|
+
|
|
27
|
+
- Docker installed on your system.
|
|
28
|
+
- NVIDIA GPU drivers installed if you plan to use GPU acceleration (`--gpus all` option).
|
|
29
|
+
|
|
30
|
+
## Build the Docker Image
|
|
31
|
+
|
|
32
|
+
To build the Docker image, navigate to the `calt` directory and run the following command:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
make build
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Alternatively, you can use the direct Docker command:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
docker build -t ta-sage .
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Run the Docker Container
|
|
45
|
+
|
|
46
|
+
To run the Docker container in detached mode with GPU support, execute:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
make run
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
The direct Docker command is:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
docker run --gpus all -d --name ta-sage-container -v "$(pwd)":/app ta-sage tail -f /dev/null
|
|
56
|
+
```
|
|
57
|
+
*Note: When running this command directly in your terminal, `$(pwd)` will resolve to your current working directory. The Makefile uses `$(CURDIR)` which serves the same purpose within the Makefile context.*
|
|
58
|
+
|
|
59
|
+
## Access the Container
|
|
60
|
+
|
|
61
|
+
Once the container is running, you can access it using:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
docker exec -it ta-sage-container bash
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Stop and Remove the Container
|
|
68
|
+
|
|
69
|
+
To stop and remove the container, you can use:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
make stop
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Or manually:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
docker stop ta-sage-container
|
|
79
|
+
docker rm ta-sage-container
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Local Setup (without Docker)
|
|
83
|
+
|
|
84
|
+
This section describes how to set up the environment locally without using Docker. This assumes you have SageMath installed on your system.
|
|
85
|
+
|
|
86
|
+
### 1. Install SageMath
|
|
87
|
+
|
|
88
|
+
You can install SageMath using `apt` on Debian/Ubuntu-based systems. It's not necessary to have the absolute latest version.
|
|
89
|
+
|
|
90
|
+
Install SageMath:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
sudo apt-get install -y sagemath
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### 2. Install Dependencies
|
|
97
|
+
|
|
98
|
+
Once SageMath is installed, you can install the required Python packages using `sage -pip`.
|
|
99
|
+
|
|
100
|
+
First, upgrade pip:
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
sage -pip install --upgrade pip
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Next, install the Python dependencies:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
sage -pip install --break-system-packages \
|
|
110
|
+
"torch==2.6.0" \
|
|
111
|
+
"transformers>=4.49.0" \
|
|
112
|
+
"omegaconf>=2.3.0" \
|
|
113
|
+
"wandb>=0.15.11" \
|
|
114
|
+
"accelerate>=0.29.0" \
|
|
115
|
+
"joblib>=1.5.0"
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
**For GPU support with PyTorch:**
|
|
119
|
+
If you need GPU support, replace the `torch` installation line with the one that specifies the CUDA version compatible with your system. For example, for CUDA 12.4:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
sage -pip install --break-system-packages \
|
|
123
|
+
--extra-index-url https://download.pytorch.org/whl/cu124 \
|
|
124
|
+
"torch==2.6.0"
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### 3. Install `calt-x` (Editable)
|
|
128
|
+
|
|
129
|
+
Finally, install the `calt-x` package in editable mode. Navigate to the root of the `calt` project directory (where this `README.md` and the `pyproject.toml` are located) and run:
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
sage -pip install -e .
|
|
133
|
+
```
|
|
134
|
+
This command assumes that the necessary setup files for `calt-x` are in the current directory (`.`). If the project lives in a different directory (e.g., `/app` as in the Dockerfile context), run `sage -pip install -e /path/to/calt` instead.
|
|
135
|
+
|
|
136
|
+
## Generating Datasets
|
|
137
|
+
|
|
138
|
+
To generate the default dataset, run the following command from the project root:
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
sage scripts/generate.py
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
To generate datasets using a different `ProblemGenerator` class, you will need to modify `scripts/generate.py` by uncommenting the desired `ProblemGenerator` class and commenting out others.
|
|
145
|
+
|
|
146
|
+
## Running Training
|
|
147
|
+
|
|
148
|
+
To start training with the default configuration, execute the following command from the project root:
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
sage scripts/train.py
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Weights & Biases (wandb) Setup
|
|
155
|
+
|
|
156
|
+
If you are using Weights & Biases (wandb) for the first time to log training progress, you will need to create an account on their website and set up your API key. When you run the training script for the first time, you will be prompted to enter your API key.
|
|
157
|
+
|
|
158
|
+
https://wandb.ai/site/
|
|
159
|
+
|
|
160
|
+
### Configuration
|
|
161
|
+
|
|
162
|
+
Training parameters can be modified by editing the configuration file located at `config/train_example.yaml`.
|
|
163
|
+
|
|
164
|
+
## Demonstrations
|
|
165
|
+
|
|
166
|
+
Simple demonstrations for data generation and training are available as Jupyter Notebook files. You can find them in the `notebook` directory (please create this directory and add your notebooks if it doesn't exist yet).
|
|
167
|
+
|
|
168
|
+
To run these notebooks, you need to start SageMath's Jupyter server using the command `sage -n` and then select the SageMath kernel in the notebook interface.
|
calt_x-0.1.0/README.md
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
# CALT: Computer ALgebra with Transformer
|
|
2
|
+
This project is currently in its initial development phase. The file structure and content are subject to significant changes. Please ensure you are referring to the latest version when using it.
|
|
3
|
+
|
|
4
|
+
# Environment Setup using Docker
|
|
5
|
+
|
|
6
|
+
This guide explains how to set up the development environment using Docker.
|
|
7
|
+
|
|
8
|
+
## Prerequisites
|
|
9
|
+
|
|
10
|
+
- Docker installed on your system.
|
|
11
|
+
- NVIDIA GPU drivers installed if you plan to use GPU acceleration (`--gpus all` option).
|
|
12
|
+
|
|
13
|
+
## Build the Docker Image
|
|
14
|
+
|
|
15
|
+
To build the Docker image, navigate to the `calt` directory and run the following command:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
make build
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Alternatively, you can use the direct Docker command:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
docker build -t ta-sage .
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Run the Docker Container
|
|
28
|
+
|
|
29
|
+
To run the Docker container in detached mode with GPU support, execute:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
make run
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
The direct Docker command is:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
docker run --gpus all -d --name ta-sage-container -v "$(pwd)":/app ta-sage tail -f /dev/null
|
|
39
|
+
```
|
|
40
|
+
*Note: When running this command directly in your terminal, `$(pwd)` will resolve to your current working directory. The Makefile uses `$(CURDIR)` which serves the same purpose within the Makefile context.*
|
|
41
|
+
|
|
42
|
+
## Access the Container
|
|
43
|
+
|
|
44
|
+
Once the container is running, you can access it using:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
docker exec -it ta-sage-container bash
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Stop and Remove the Container
|
|
51
|
+
|
|
52
|
+
To stop and remove the container, you can use:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
make stop
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Or manually:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
docker stop ta-sage-container
|
|
62
|
+
docker rm ta-sage-container
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Local Setup (without Docker)
|
|
66
|
+
|
|
67
|
+
This section describes how to set up the environment locally without using Docker. This assumes you have SageMath installed on your system.
|
|
68
|
+
|
|
69
|
+
### 1. Install SageMath
|
|
70
|
+
|
|
71
|
+
You can install SageMath using `apt` on Debian/Ubuntu-based systems. It's not necessary to have the absolute latest version.
|
|
72
|
+
|
|
73
|
+
Install SageMath:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
sudo apt-get install -y sagemath
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### 2. Install Dependencies
|
|
80
|
+
|
|
81
|
+
Once SageMath is installed, you can install the required Python packages using `sage -pip`.
|
|
82
|
+
|
|
83
|
+
First, upgrade pip:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
sage -pip install --upgrade pip
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Next, install the Python dependencies:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
sage -pip install --break-system-packages \
|
|
93
|
+
"torch==2.6.0" \
|
|
94
|
+
"transformers>=4.49.0" \
|
|
95
|
+
"omegaconf>=2.3.0" \
|
|
96
|
+
"wandb>=0.15.11" \
|
|
97
|
+
"accelerate>=0.29.0" \
|
|
98
|
+
"joblib>=1.5.0"
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
**For GPU support with PyTorch:**
|
|
102
|
+
If you need GPU support, replace the `torch` installation line with the one that specifies the CUDA version compatible with your system. For example, for CUDA 12.4:
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
sage -pip install --break-system-packages \
|
|
106
|
+
--extra-index-url https://download.pytorch.org/whl/cu124 \
|
|
107
|
+
"torch==2.6.0"
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### 3. Install `calt-x` (Editable)
|
|
111
|
+
|
|
112
|
+
Finally, install the `calt-x` package in editable mode. Navigate to the root of the `calt` project directory (where this `README.md` and the `pyproject.toml` are located) and run:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
sage -pip install -e .
|
|
116
|
+
```
|
|
117
|
+
This command assumes that the necessary setup files for `calt-x` are in the current directory (`.`). If the project lives in a different directory (e.g., `/app` as in the Dockerfile context), run `sage -pip install -e /path/to/calt` instead.
|
|
118
|
+
|
|
119
|
+
## Generating Datasets
|
|
120
|
+
|
|
121
|
+
To generate the default dataset, run the following command from the project root:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
sage scripts/generate.py
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
To generate datasets using a different `ProblemGenerator` class, you will need to modify `scripts/generate.py` by uncommenting the desired `ProblemGenerator` class and commenting out others.
|
|
128
|
+
|
|
129
|
+
## Running Training
|
|
130
|
+
|
|
131
|
+
To start training with the default configuration, execute the following command from the project root:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
sage scripts/train.py
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Weights & Biases (wandb) Setup
|
|
138
|
+
|
|
139
|
+
If you are using Weights & Biases (wandb) for the first time to log training progress, you will need to create an account on their website and set up your API key. When you run the training script for the first time, you will be prompted to enter your API key.
|
|
140
|
+
|
|
141
|
+
https://wandb.ai/site/
|
|
142
|
+
|
|
143
|
+
### Configuration
|
|
144
|
+
|
|
145
|
+
Training parameters can be modified by editing the configuration file located at `config/train_example.yaml`.
|
|
146
|
+
|
|
147
|
+
## Demonstrations
|
|
148
|
+
|
|
149
|
+
Simple demonstrations for data generation and training are available as Jupyter Notebook files. You can find them in the `notebook` directory (please create this directory and add your notebooks if it doesn't exist yet).
|
|
150
|
+
|
|
151
|
+
To run these notebooks, you need to start SageMath's Jupyter server using the command `sage -n` and then select the SageMath kernel in the notebook interface.
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=65", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "calt-x"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A library for computational algebra using Transformers"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
authors = [
|
|
12
|
+
{name = "Yuta Sato", email = "sato.yuta@gmail.com"}
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
dependencies = [
|
|
16
|
+
"transformers>=4.49.0",
|
|
17
|
+
"omegaconf>=2.3.0",
|
|
18
|
+
"torch>=2.6.0",
|
|
19
|
+
"wandb>=0.15.11",
|
|
20
|
+
"accelerate>=0.29.0",
|
|
21
|
+
"joblib>=1.5.0",
|
|
22
|
+
"sympy>=1.12",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[dependency-groups]
|
|
26
|
+
dev = [
|
|
27
|
+
"mypy>=1.15.0",
|
|
28
|
+
"ruff>=0.9.9",
|
|
29
|
+
"pydantic>=2.10.6",
|
|
30
|
+
"pytest>=8.3.5",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[tool.ruff]
|
|
34
|
+
lint.per-file-ignores = {"src/calt/__init__.py" = ["F401"]}
|
|
35
|
+
exclude = [
|
|
36
|
+
"sage/**",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[tool.setuptools]
|
|
40
|
+
package-dir = {"" = "src"}
|
|
41
|
+
|
|
42
|
+
[tool.setuptools.packages.find]
|
|
43
|
+
where = ["src"]
|
|
44
|
+
|
|
45
|
+
[project.urls]
|
|
46
|
+
Source = "https://github.com/HiroshiKERA/calt"
|
|
47
|
+
Issues = "https://github.com/HiroshiKERA/calt/issues"
|
calt_x-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
from .trainer.trainer import PolynomialTrainer
|
|
2
|
+
from .trainer.utils import count_cuda_devices
|
|
3
|
+
from .data_loader.data_loader import data_loader
|
|
4
|
+
from .data_loader.utils.data_collator import StandardDataset, StandardDataCollator
|
|
5
|
+
from .data_loader.utils.tokenizer import set_tokenizer
|
|
6
|
+
from .data_loader.utils.preprocessor import SymbolicToInternalProcessor, IntegerToInternalProcessor
|
|
7
|
+
# from .generate.dataset_generator import DatasetGenerator
|
|
8
|
+
# from .generate.utils.polynomial_sampler import PolynomialSampler
|
|
9
|
+
from .generate.utils.dataset_writer import DatasetWriter
|
|
File without changes
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Data loading utilities for the Transformer Algebra project.
|
|
2
|
+
|
|
3
|
+
This module defines helper functions that build the training and evaluation
|
|
4
|
+
`Dataset`, `Tokenizer`, and `DataCollator` objects used throughout the
|
|
5
|
+
library. In particular, the `data_loader` factory translates symbolic
|
|
6
|
+
polynomial expressions into the internal token representation expected by the
|
|
7
|
+
Transformer models.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from .utils.data_collator import StandardDataset, StandardDataCollator
|
|
11
|
+
from .utils.preprocessor import SymbolicToInternalProcessor, IntegerToInternalProcessor
|
|
12
|
+
from .utils.tokenizer import set_tokenizer
|
|
13
|
+
from transformers import PreTrainedTokenizerFast as StandardTokenizer
|
|
14
|
+
from typing import Tuple
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def data_loader(
    train_dataset_path: str,
    test_dataset_path: str,
    field: str,
    num_variables: int,
    max_degree: int,
    max_coeff: int,
    max_length: int = 512,
    processor_name: str = "polynomial",
) -> Tuple[dict[str, StandardDataset], StandardTokenizer, StandardDataCollator]:
    """Create dataset, tokenizer and data-collator objects.

    Parameters
    ----------
    train_dataset_path : str
        Path to the file that stores the *training* samples.
    test_dataset_path : str
        Path to the file that stores the *evaluation* samples.
    field : str
        Field identifier forwarded to ``set_tokenizer`` to build the
        vocabulary (the original documentation gives ``"Q"`` for the
        rationals and ``"Zp"`` for a prime field as examples).
    num_variables : int
        Number of symbolic variables (x_1, ..., x_n); passed to both the
        polynomial preprocessor and the tokenizer.
    max_degree : int
        Maximum degree; passed to both the polynomial preprocessor and the
        tokenizer.
    max_coeff : int
        Maximum coefficient value; passed to the preprocessor and the
        tokenizer.
    max_length : int, default ``512``
        Forwarded to ``set_tokenizer`` as ``max_length``.
        NOTE(review): presumably the upper bound on the token sequence
        length -- confirm against ``set_tokenizer``.
    processor_name : str, default ``"polynomial"``
        Name of the preprocessor applied to every sample.
        ``"polynomial"`` selects :class:`SymbolicToInternalProcessor`;
        ``"integer"`` (or its legacy alias ``"numeric"``) selects
        :class:`IntegerToInternalProcessor`.

    Returns
    -------
    tuple
        1. ``dataset`` - a ``dict`` with ``"train"`` and ``"test"`` keys
           mapping to :class:`StandardDataset` instances.
        2. ``tokenizer`` - the object returned by ``set_tokenizer``.
        3. ``data_collator`` - a :class:`StandardDataCollator` built around
           that tokenizer.

    Raises
    ------
    ValueError
        If ``processor_name`` is not one of the supported names.
    """
    if processor_name == "polynomial":
        preprocessor = SymbolicToInternalProcessor(
            num_variables=num_variables,
            max_degree=max_degree,
            max_coeff=max_coeff,
        )
    elif processor_name in ("integer", "numeric"):
        # The documentation advertises "integer" while earlier code only
        # accepted "numeric"; both are honoured so no caller breaks.
        preprocessor = IntegerToInternalProcessor(max_coeff=max_coeff)
    else:
        raise ValueError(f"Unknown processor: {processor_name}")

    # Both splits share the same preprocessor instance.
    dataset = {
        "train": StandardDataset(train_dataset_path, preprocessor),
        "test": StandardDataset(test_dataset_path, preprocessor),
    }
    tokenizer = set_tokenizer(
        num_vars=num_variables,
        field=field,
        max_degree=max_degree,
        max_coeff=max_coeff,
        max_length=max_length,
    )
    data_collator = StandardDataCollator(tokenizer)
    return dataset, tokenizer, data_collator
|
|
File without changes
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
from transformers import PreTrainedTokenizerFast as Tokenizer
|
|
2
|
+
from .preprocessor import AbstractPreprocessor
|
|
3
|
+
from typing import Dict
|
|
4
|
+
from torch.utils.data import Dataset
|
|
5
|
+
import torch
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class StandardDataset(Dataset):
    """Text dataset of ``input # target`` pairs read from a single file.

    Each non-empty line of the file is split on the first ``"#"``; the part
    before it is stored as the input text and the part after it as the
    target text. Lines without a ``"#"`` are skipped. The preprocessor is
    applied lazily, on item access.
    """

    def __init__(self, data_path: str, preprocessor: AbstractPreprocessor) -> None:
        """Read ``data_path`` and store the raw input/target strings.

        Parameters
        ----------
        data_path : str
            Path of the UTF-8 text file to read.
        preprocessor : AbstractPreprocessor
            Callable applied to each text in :meth:`__getitem__`.
        """
        self.data_path = data_path
        self.input_texts = []
        self.targets_texts = []
        self.preprocessor = preprocessor

        with open(self.data_path, "r", encoding="utf-8") as handle:
            for record in map(str.strip, handle):
                # An empty separator means the record had no "#"; this also
                # covers blank lines, so one check filters both cases.
                source, separator, target = record.partition("#")
                if not separator:
                    continue
                self.input_texts.append(source.strip())
                self.targets_texts.append(target.strip())

    def __getitem__(self, idx: int) -> Dict[str, str]:
        """Return the preprocessed sample stored at position ``idx``.

        Parameters
        ----------
        idx : int
            Index of the item to retrieve.

        Returns
        -------
        dict
            ``{"input": ..., "target": ...}`` holding the preprocessor
            output for the source and target text respectively.
        """
        return {
            "input": self.preprocessor(self.input_texts[idx]),
            "target": self.preprocessor(self.targets_texts[idx]),
        }

    def __len__(self) -> int:
        """Return the number of samples that were loaded."""
        return len(self.input_texts)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class StandardDataCollator:
    """Assemble a list of sample dicts into one batch dict.

    With a tokenizer, the ``"input"`` and ``"target"`` attributes are
    tokenized and every other attribute is zero-padded into a tensor.
    Without a tokenizer, the samples are only regrouped by attribute.
    """

    def __init__(self, tokenizer: "Tokenizer | None" = None) -> None:
        """Store the tokenizer; ``None`` disables tokenization and padding."""
        self.tokenizer = tokenizer

    def _pad_sequences(self, sequences, padding_value=0):
        """Pad a list of sequences to equal length and return a long tensor.

        Parameters
        ----------
        sequences : sequence of sequences
            Per-sample values; any sequence type convertible with ``list``
            (lists, tuples, ...) is accepted.
        padding_value : int, default ``0``
            Value appended to sequences shorter than the longest one.

        Returns
        -------
        torch.Tensor
            Tensor of shape ``(len(sequences), max_length + 2)`` and dtype
            ``torch.long``. The first and last columns are always zero; the
            original comments describe them as room for BOS/EOS tokens.

        Raises
        ------
        ValueError
            If ``sequences`` is empty (raised by ``max``).
        """
        max_length = max(len(seq) for seq in sequences)

        # list(seq) lets tuples and other sequence types be padded as well.
        rows = [
            list(seq) + [padding_value] * (max_length - len(seq))
            for seq in sequences
        ]

        # Two extra columns: one leading and one trailing zero slot.
        padded = torch.zeros(len(rows), max_length + 2, dtype=torch.long)
        padded[:, 1 : max_length + 1] = torch.tensor(rows)
        return padded

    def __call__(self, batch):
        """Collate a batch of data samples.

        Parameters
        ----------
        batch : list of dict
            Samples sharing the keys of the first sample.

        Returns
        -------
        dict
            Without a tokenizer: each attribute mapped to the list of its
            per-sample values. With a tokenizer:

            * ``"input"`` yields ``input_ids`` and ``attention_mask``;
            * ``"target"`` yields ``decoder_input_ids`` and
              ``decoder_attention_mask`` (last position dropped) and
              ``labels`` (first position dropped, masked positions set to
              ``-100``);
            * any other attribute is padded with :meth:`_pad_sequences`,
              and a ``target_`` prefix is renamed to ``decoder_``.

        Raises
        ------
        IndexError
            If ``batch`` is empty.
        """
        # The first sample defines which attributes are collated.
        attributes = batch[0].keys()

        if self.tokenizer is None:
            # No tokenizer: only regroup the values attribute by attribute.
            return {
                attribute: [item[attribute] for item in batch]
                for attribute in attributes
            }

        batch_dict = {}
        for attribute in attributes:
            values = [item[attribute] for item in batch]

            if attribute == "input":
                inputs = self.tokenizer(values, padding="longest", return_tensors="pt")
                batch_dict["input_ids"] = inputs["input_ids"]
                batch_dict["attention_mask"] = inputs["attention_mask"]

            elif attribute == "target":
                targets = self.tokenizer(values, padding="longest", return_tensors="pt")
                target_ids = targets["input_ids"]
                target_mask = targets["attention_mask"]

                # Decoder inputs drop the last position (usually EOS).
                batch_dict["decoder_input_ids"] = target_ids[:, :-1].contiguous()
                batch_dict["decoder_attention_mask"] = target_mask[:, :-1].contiguous()

                # Labels drop the first position (usually BOS); positions
                # masked out by the attention mask become -100 so the loss
                # function ignores them.
                labels = target_ids[:, 1:].contiguous()
                keep = target_mask[:, 1:].contiguous().bool()
                labels[~keep] = -100
                batch_dict["labels"] = labels

            else:
                # 'target_aux' becomes 'decoder_aux'; other names are kept.
                if attribute.startswith("target_"):
                    key = "decoder_" + attribute.removeprefix("target_")
                else:
                    key = attribute
                batch_dict[key] = self._pad_sequences(values, padding_value=0)

        return batch_dict
|