mysphinx-forge 0.2.2__tar.gz → 0.2.2.1__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (46)
  1. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/PKG-INFO +3 -3
  2. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/README.md +2 -2
  3. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/cli.py +7 -7
  4. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/templates/mysphinx-forge.yaml +4 -0
  5. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge.egg-info/PKG-INFO +3 -3
  6. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/pyproject.toml +1 -1
  7. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/__init__.py +0 -0
  8. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/cleaning.py +0 -0
  9. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/cluster_labeling.py +0 -0
  10. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/cluster_reporting.py +0 -0
  11. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/clustering.py +0 -0
  12. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/config.py +0 -0
  13. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/deduplication.py +0 -0
  14. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/embedding.py +0 -0
  15. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/env_utils.py +0 -0
  16. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/file_io.py +0 -0
  17. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/http_client.py +0 -0
  18. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/logging_utils.py +0 -0
  19. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/model_eval.py +0 -0
  20. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/model_testing.py +0 -0
  21. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/openai_responses.py +0 -0
  22. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/progress.py +0 -0
  23. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/semantic_deduplication.py +0 -0
  24. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/sft_dataset.py +0 -0
  25. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/splitting.py +0 -0
  26. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge/templates/__init__.py +0 -0
  27. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge.egg-info/SOURCES.txt +0 -0
  28. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge.egg-info/dependency_links.txt +0 -0
  29. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge.egg-info/entry_points.txt +0 -0
  30. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge.egg-info/requires.txt +0 -0
  31. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/mysphinx_forge.egg-info/top_level.txt +0 -0
  32. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/setup.cfg +0 -0
  33. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_cleaning.py +0 -0
  34. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_cli.py +0 -0
  35. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_cluster_labeling.py +0 -0
  36. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_cluster_reporting.py +0 -0
  37. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_clustering.py +0 -0
  38. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_deduplication.py +0 -0
  39. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_file_io.py +0 -0
  40. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_http_client.py +0 -0
  41. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_model_eval.py +0 -0
  42. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_model_testing.py +0 -0
  43. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_semantic_deduplication.py +0 -0
  44. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_sft_cli.py +0 -0
  45. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_sft_dataset.py +0 -0
  46. {mysphinx_forge-0.2.2 → mysphinx_forge-0.2.2.1}/tests/test_splitting.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mysphinx-forge
3
- Version: 0.2.2
3
+ Version: 0.2.2.1
4
4
  Summary: Data and model workflow toolkit for cleaning, clustering, generation, and evaluation
5
5
  Keywords: data-cleaning,deduplication,clustering,nlp,cli
6
6
  Classifier: Development Status :: 3 - Alpha
@@ -487,10 +487,10 @@ input_deduplicated_split_train_pa_1.jsonl # 前 10000 条
487
487
  input_deduplicated_split_train_pa_2.jsonl # 后 2000 条
488
488
  ```
489
489
 
490
- 通过 `--sft-pa-max-records-per-file` 可自定义阈值:
490
+ 通过 `--sft-max-records-per-file` 可自定义阈值:
491
491
 
492
492
  ```bash
493
- mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-pa-max-records-per-file 5000
493
+ mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-max-records-per-file 5000
494
494
  ```
495
495
 
496
496
  说明:
@@ -450,10 +450,10 @@ input_deduplicated_split_train_pa_1.jsonl # 前 10000 条
450
450
  input_deduplicated_split_train_pa_2.jsonl # 后 2000 条
451
451
  ```
452
452
 
453
- 通过 `--sft-pa-max-records-per-file` 可自定义阈值:
453
+ 通过 `--sft-max-records-per-file` 可自定义阈值:
454
454
 
455
455
  ```bash
456
- mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-pa-max-records-per-file 5000
456
+ mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-max-records-per-file 5000
457
457
  ```
458
458
 
459
459
  说明:
@@ -312,7 +312,7 @@ def main() -> int:
312
312
  resolved_sft_system_prompt,
313
313
  args.sft_system_column,
314
314
  args.sft_user_query_as_instruction,
315
- args.sft_pa_max_records_per_file,
315
+ args.sft_max_records_per_file,
316
316
  )
317
317
 
318
318
  parser.print_help()
@@ -715,10 +715,10 @@ def _build_parser(
715
715
  help="为 true 时将用户输入作为 alpaca instruction 字段,input 字段留空;为 false 时保持原有行为(input 存用户输入,instruction 为固定文本)。默认 true。",
716
716
  )
717
717
  parser.add_argument(
718
- "--sft-pa-max-records-per-file",
718
+ "--sft-max-records-per-file",
719
719
  type=int,
720
- dest="sft_pa_max_records_per_file",
721
- default=config_defaults.get("sft_pa_max_records_per_file", PA_MAX_RECORDS_PER_FILE),
720
+ dest="sft_max_records_per_file",
721
+ default=config_defaults.get("sft_max_records_per_file", PA_MAX_RECORDS_PER_FILE),
722
722
  help=f"pa 格式每个 JSONL 文件最大记录数,超出时自动切分为多个文件,默认 {PA_MAX_RECORDS_PER_FILE}。",
723
723
  )
724
724
  return parser
@@ -1818,7 +1818,7 @@ def _run_convert_sft(
1818
1818
  sft_system_prompt: str,
1819
1819
  sft_system_column: str,
1820
1820
  sft_user_query_as_instruction: bool = True,
1821
- sft_pa_max_records_per_file: int = PA_MAX_RECORDS_PER_FILE,
1821
+ sft_max_records_per_file: int = PA_MAX_RECORDS_PER_FILE,
1822
1822
  ) -> int:
1823
1823
  input_path = Path(input_file)
1824
1824
  output_path = _resolve_sft_output_path(input_path, output_arg, sft_format)
@@ -1863,7 +1863,7 @@ def _run_convert_sft(
1863
1863
  run_stage("写出结果", logger=logger)
1864
1864
  if sft_format == PA_SFT_FORMAT:
1865
1865
  written_paths = write_pa_dataset(
1866
- records, output_path, max_records_per_file=sft_pa_max_records_per_file
1866
+ records, output_path, max_records_per_file=sft_max_records_per_file
1867
1867
  )
1868
1868
  else:
1869
1869
  write_alpaca_dataset(records, output_path)
@@ -1880,7 +1880,7 @@ def _run_convert_sft(
1880
1880
  "sft_system_prompt": sft_system_prompt,
1881
1881
  "sft_system_column": sft_system_column,
1882
1882
  "sft_user_query_as_instruction": sft_user_query_as_instruction,
1883
- "sft_pa_max_records_per_file": sft_pa_max_records_per_file,
1883
+ "sft_max_records_per_file": sft_max_records_per_file,
1884
1884
  },
1885
1885
  sft_conversion_stats=stats,
1886
1886
  extra_output_files={f"output_file_{i + 1}": p for i, p in enumerate(written_paths)}
@@ -409,3 +409,7 @@ convert-sft:
409
409
  # 按行读取的 system 列。
410
410
  # 若提供且该行非空,则优先使用该列值覆盖 sft_system_prompt。
411
411
  sft_system_column: ""
412
+
413
+ # pa 格式每个 JSONL 文件最大记录数,超出时自动切分为多个文件。
414
+ # 仅 sft_format=pa 时生效。
415
+ sft_max_records_per_file: 10000
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mysphinx-forge
3
- Version: 0.2.2
3
+ Version: 0.2.2.1
4
4
  Summary: Data and model workflow toolkit for cleaning, clustering, generation, and evaluation
5
5
  Keywords: data-cleaning,deduplication,clustering,nlp,cli
6
6
  Classifier: Development Status :: 3 - Alpha
@@ -487,10 +487,10 @@ input_deduplicated_split_train_pa_1.jsonl # 前 10000 条
487
487
  input_deduplicated_split_train_pa_2.jsonl # 后 2000 条
488
488
  ```
489
489
 
490
- 通过 `--sft-pa-max-records-per-file` 可自定义阈值:
490
+ 通过 `--sft-max-records-per-file` 可自定义阈值:
491
491
 
492
492
  ```bash
493
- mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-pa-max-records-per-file 5000
493
+ mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-max-records-per-file 5000
494
494
  ```
495
495
 
496
496
  说明:
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "mysphinx-forge"
7
- version = "0.2.2"
7
+ version = "0.2.2.1"
8
8
  description = "Data and model workflow toolkit for cleaning, clustering, generation, and evaluation"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.12"