mysphinx-forge 0.2.2__tar.gz → 0.2.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/PKG-INFO +3 -3
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/README.md +2 -2
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/cli.py +7 -7
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/templates/mysphinx-forge.yaml +4 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge.egg-info/PKG-INFO +3 -3
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/pyproject.toml +1 -1
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/__init__.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/cleaning.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/cluster_labeling.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/cluster_reporting.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/clustering.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/config.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/deduplication.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/embedding.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/env_utils.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/file_io.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/http_client.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/logging_utils.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/model_eval.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/model_testing.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/openai_responses.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/progress.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/semantic_deduplication.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/sft_dataset.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/splitting.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/templates/__init__.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge.egg-info/SOURCES.txt +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge.egg-info/dependency_links.txt +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge.egg-info/entry_points.txt +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge.egg-info/requires.txt +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge.egg-info/top_level.txt +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/setup.cfg +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_cleaning.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_cli.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_cluster_labeling.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_cluster_reporting.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_clustering.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_deduplication.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_file_io.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_http_client.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_model_eval.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_model_testing.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_semantic_deduplication.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_sft_cli.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_sft_dataset.py +0 -0
- {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_splitting.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mysphinx-forge
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.2.1
|
|
4
4
|
Summary: Data and model workflow toolkit for cleaning, clustering, generation, and evaluation
|
|
5
5
|
Keywords: data-cleaning,deduplication,clustering,nlp,cli
|
|
6
6
|
Classifier: Development Status :: 3 - Alpha
|
|
@@ -487,10 +487,10 @@ input_deduplicated_split_train_pa_1.jsonl # 前 10000 条
|
|
|
487
487
|
input_deduplicated_split_train_pa_2.jsonl # 后 2000 条
|
|
488
488
|
```
|
|
489
489
|
|
|
490
|
-
通过 `--sft-
|
|
490
|
+
通过 `--sft-max-records-per-file` 可自定义阈值:
|
|
491
491
|
|
|
492
492
|
```bash
|
|
493
|
-
mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-
|
|
493
|
+
mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-max-records-per-file 5000
|
|
494
494
|
```
|
|
495
495
|
|
|
496
496
|
说明:
|
|
@@ -450,10 +450,10 @@ input_deduplicated_split_train_pa_1.jsonl # 前 10000 条
|
|
|
450
450
|
input_deduplicated_split_train_pa_2.jsonl # 后 2000 条
|
|
451
451
|
```
|
|
452
452
|
|
|
453
|
-
通过 `--sft-
|
|
453
|
+
通过 `--sft-max-records-per-file` 可自定义阈值:
|
|
454
454
|
|
|
455
455
|
```bash
|
|
456
|
-
mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-
|
|
456
|
+
mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-max-records-per-file 5000
|
|
457
457
|
```
|
|
458
458
|
|
|
459
459
|
说明:
|
|
@@ -312,7 +312,7 @@ def main() -> int:
|
|
|
312
312
|
resolved_sft_system_prompt,
|
|
313
313
|
args.sft_system_column,
|
|
314
314
|
args.sft_user_query_as_instruction,
|
|
315
|
-
args.
|
|
315
|
+
args.sft_max_records_per_file,
|
|
316
316
|
)
|
|
317
317
|
|
|
318
318
|
parser.print_help()
|
|
@@ -715,10 +715,10 @@ def _build_parser(
|
|
|
715
715
|
help="为 true 时将用户输入作为 alpaca instruction 字段,input 字段留空;为 false 时保持原有行为(input 存用户输入,instruction 为固定文本)。默认 true。",
|
|
716
716
|
)
|
|
717
717
|
parser.add_argument(
|
|
718
|
-
"--sft-
|
|
718
|
+
"--sft-max-records-per-file",
|
|
719
719
|
type=int,
|
|
720
|
-
dest="
|
|
721
|
-
default=config_defaults.get("
|
|
720
|
+
dest="sft_max_records_per_file",
|
|
721
|
+
default=config_defaults.get("sft_max_records_per_file", PA_MAX_RECORDS_PER_FILE),
|
|
722
722
|
help=f"pa 格式每个 JSONL 文件最大记录数,超出时自动切分为多个文件,默认 {PA_MAX_RECORDS_PER_FILE}。",
|
|
723
723
|
)
|
|
724
724
|
return parser
|
|
@@ -1818,7 +1818,7 @@ def _run_convert_sft(
|
|
|
1818
1818
|
sft_system_prompt: str,
|
|
1819
1819
|
sft_system_column: str,
|
|
1820
1820
|
sft_user_query_as_instruction: bool = True,
|
|
1821
|
-
|
|
1821
|
+
sft_max_records_per_file: int = PA_MAX_RECORDS_PER_FILE,
|
|
1822
1822
|
) -> int:
|
|
1823
1823
|
input_path = Path(input_file)
|
|
1824
1824
|
output_path = _resolve_sft_output_path(input_path, output_arg, sft_format)
|
|
@@ -1863,7 +1863,7 @@ def _run_convert_sft(
|
|
|
1863
1863
|
run_stage("写出结果", logger=logger)
|
|
1864
1864
|
if sft_format == PA_SFT_FORMAT:
|
|
1865
1865
|
written_paths = write_pa_dataset(
|
|
1866
|
-
records, output_path, max_records_per_file=
|
|
1866
|
+
records, output_path, max_records_per_file=sft_max_records_per_file
|
|
1867
1867
|
)
|
|
1868
1868
|
else:
|
|
1869
1869
|
write_alpaca_dataset(records, output_path)
|
|
@@ -1880,7 +1880,7 @@ def _run_convert_sft(
|
|
|
1880
1880
|
"sft_system_prompt": sft_system_prompt,
|
|
1881
1881
|
"sft_system_column": sft_system_column,
|
|
1882
1882
|
"sft_user_query_as_instruction": sft_user_query_as_instruction,
|
|
1883
|
-
"
|
|
1883
|
+
"sft_max_records_per_file": sft_max_records_per_file,
|
|
1884
1884
|
},
|
|
1885
1885
|
sft_conversion_stats=stats,
|
|
1886
1886
|
extra_output_files={f"output_file_{i + 1}": p for i, p in enumerate(written_paths)}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mysphinx-forge
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.2.1
|
|
4
4
|
Summary: Data and model workflow toolkit for cleaning, clustering, generation, and evaluation
|
|
5
5
|
Keywords: data-cleaning,deduplication,clustering,nlp,cli
|
|
6
6
|
Classifier: Development Status :: 3 - Alpha
|
|
@@ -487,10 +487,10 @@ input_deduplicated_split_train_pa_1.jsonl # 前 10000 条
|
|
|
487
487
|
input_deduplicated_split_train_pa_2.jsonl # 后 2000 条
|
|
488
488
|
```
|
|
489
489
|
|
|
490
|
-
通过 `--sft-
|
|
490
|
+
通过 `--sft-max-records-per-file` 可自定义阈值:
|
|
491
491
|
|
|
492
492
|
```bash
|
|
493
|
-
mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-
|
|
493
|
+
mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-max-records-per-file 5000
|
|
494
494
|
```
|
|
495
495
|
|
|
496
496
|
说明:
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "mysphinx-forge"
|
|
7
|
-
version = "0.2.2"
|
|
7
|
+
version = "0.2.2.1"
|
|
8
8
|
description = "Data and model workflow toolkit for cleaning, clustering, generation, and evaluation"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.12"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|