PyPI - mysphinx-forge - Versions diffs - 0.2.2__tar.gz → 0.2.3__tar.gz - Mend

mysphinx-forge 0.2.2 (tar.gz) → 0.2.3 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46)

{mysphinx_forge-0.2.2 → mysphinx_forge-0.2.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mysphinx-forge
-Version: 0.2.2
+Version: 0.2.3
 Summary: Data and model workflow toolkit for cleaning, clustering, generation, and evaluation
 Keywords: data-cleaning,deduplication,clustering,nlp,cli
 Classifier: Development Status :: 3 - Alpha
@@ -487,10 +487,10 @@ input_deduplicated_split_train_pa_1.jsonl   # 前 10000 条
 input_deduplicated_split_train_pa_2.jsonl   # 后 2000 条
 ```
-通过 `--sft-pa-max-records-per-file` 可自定义阈值：
+通过 `--sft-max-records-per-file` 可自定义阈值：
 ```bash
-mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-pa-max-records-per-file 5000
+mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-max-records-per-file 5000
 ```
 说明：
@@ -555,6 +555,8 @@ mysphinx-forge --action split --input-file data/input_deduplicated.xlsx
 三者均为可选，可以同时存在，也可以只有其中一个或多个。
+`clean`、`deduplicate`、`clean-deduplicate`、`cluster` 这几个 `split` 之前的步骤会原样保留这三个特殊 sheet（不参与清洗/去重/聚类处理），并在输出文件中继续以独立 sheet 的形式存在，确保依次执行整条流水线后，`split` 仍能正确识别并注入这些数据。
 显式分层切分：
 ```bash

{mysphinx_forge-0.2.2 → mysphinx_forge-0.2.3}/README.md RENAMED Viewed

@@ -450,10 +450,10 @@ input_deduplicated_split_train_pa_1.jsonl   # 前 10000 条
 input_deduplicated_split_train_pa_2.jsonl   # 后 2000 条
 ```
-通过 `--sft-pa-max-records-per-file` 可自定义阈值：
+通过 `--sft-max-records-per-file` 可自定义阈值：
 ```bash
-mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-pa-max-records-per-file 5000
+mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-max-records-per-file 5000
 ```
 说明：
@@ -518,6 +518,8 @@ mysphinx-forge --action split --input-file data/input_deduplicated.xlsx
 三者均为可选，可以同时存在，也可以只有其中一个或多个。
+`clean`、`deduplicate`、`clean-deduplicate`、`cluster` 这几个 `split` 之前的步骤会原样保留这三个特殊 sheet（不参与清洗/去重/聚类处理），并在输出文件中继续以独立 sheet 的形式存在，确保依次执行整条流水线后，`split` 仍能正确识别并注入这些数据。
 显式分层切分：
 ```bash

{mysphinx_forge-0.2.2 → mysphinx_forge-0.2.3}/mysphinx_forge/cli.py RENAMED Viewed

@@ -40,6 +40,7 @@ from mysphinx_forge.file_io import (
     load_dataframe,
     load_split_dataframes,
     write_dataframe,
+    write_dataframe_with_injection_sheets,
     write_match_rows,
 )
 from mysphinx_forge import __version__
@@ -312,7 +313,7 @@ def main() -> int:
             resolved_sft_system_prompt,
             args.sft_system_column,
             args.sft_user_query_as_instruction,
-            args.sft_pa_max_records_per_file,
+            args.sft_max_records_per_file,
         )
     parser.print_help()
@@ -715,10 +716,10 @@ def _build_parser(
         help="为 true 时将用户输入作为 alpaca instruction 字段，input 字段留空；为 false 时保持原有行为（input 存用户输入，instruction 为固定文本）。默认 true。",
     )
     parser.add_argument(
-        "--sft-pa-max-records-per-file",
+        "--sft-max-records-per-file",
         type=int,
-        dest="sft_pa_max_records_per_file",
-        default=config_defaults.get("sft_pa_max_records_per_file", PA_MAX_RECORDS_PER_FILE),
+        dest="sft_max_records_per_file",
+        default=config_defaults.get("sft_max_records_per_file", PA_MAX_RECORDS_PER_FILE),
         help=f"pa 格式每个 JSONL 文件最大记录数，超出时自动切分为多个文件，默认 {PA_MAX_RECORDS_PER_FILE}。",
     )
     return parser
@@ -758,7 +759,7 @@ def _run_clean(
     try:
         run_stage("读取文件", logger=logger)
-        dataframe = load_dataframe(input_file)
+        dataframe, train_inject_df, valid_inject_df, test_inject_df = load_split_dataframes(input_file)
     except ValueError as exc:
         _emit_error(str(exc), logger)
         close_logger()
@@ -784,7 +785,13 @@ def _run_clean(
         progress_bar.close()
     run_stage("写出结果", logger=logger)
-    write_dataframe(cleaned, output_path)
+    write_dataframe_with_injection_sheets(
+        cleaned,
+        output_path,
+        train_inject=train_inject_df,
+        valid_inject=valid_inject_df,
+        test_inject=test_inject_df,
+    )
     _write_meta(
         output_path=output_path,
         action="clean",
@@ -872,7 +879,7 @@ def _run_deduplicate(
     try:
         run_stage("读取文件", logger=logger)
-        dataframe = load_dataframe(input_file)
+        dataframe, train_inject_df, valid_inject_df, test_inject_df = load_split_dataframes(input_file)
         progress_bar = ProgressBar(total=len(dataframe), description="执行去重", logger=logger)
         try:
             deduplicated, stats, match_rows = _deduplicate_dataframe(
@@ -904,7 +911,13 @@ def _run_deduplicate(
         return 1
     run_stage("写出结果", logger=logger)
-    write_dataframe(deduplicated, output_path)
+    write_dataframe_with_injection_sheets(
+        deduplicated,
+        output_path,
+        train_inject=train_inject_df,
+        valid_inject=valid_inject_df,
+        test_inject=test_inject_df,
+    )
     write_match_rows(
         match_rows,
         _resolve_match_output_path(output_path),
@@ -979,7 +992,7 @@ def _run_clean_deduplicate(
     try:
         run_stage("读取文件", logger=logger)
-        dataframe = load_dataframe(input_file)
+        dataframe, train_inject_df, valid_inject_df, test_inject_df = load_split_dataframes(input_file)
         clean_bar = ProgressBar(total=len(dataframe), description="清洗数据", logger=logger)
         try:
             cleaned, clean_stats = clean_dataframe(
@@ -1030,7 +1043,13 @@ def _run_clean_deduplicate(
         return 1
     run_stage("写出结果", logger=logger)
-    write_dataframe(deduplicated, output_path)
+    write_dataframe_with_injection_sheets(
+        deduplicated,
+        output_path,
+        train_inject=train_inject_df,
+        valid_inject=valid_inject_df,
+        test_inject=test_inject_df,
+    )
     write_match_rows(
         match_rows,
         _resolve_match_output_path(output_path),
@@ -1103,7 +1122,7 @@ def _run_cluster(
     try:
         run_stage("读取文件", logger=logger)
-        dataframe = load_dataframe(input_file)
+        dataframe, train_inject_df, valid_inject_df, test_inject_df = load_split_dataframes(input_file)
         progress_bar = ProgressBar(total=len(dataframe), description="执行聚类", logger=logger)
         try:
             clustered, cluster_summary, projection, stats = clusterer.cluster_dataframe(
@@ -1133,7 +1152,13 @@ def _run_cluster(
     run_stage("写出结果", logger=logger)
     analysis_report = build_cluster_analysis_report(cluster_summary, stats)
-    write_dataframe(clustered, output_path)
+    write_dataframe_with_injection_sheets(
+        clustered,
+        output_path,
+        train_inject=train_inject_df,
+        valid_inject=valid_inject_df,
+        test_inject=test_inject_df,
+    )
     write_dataframe(cluster_summary, cluster_summary_path)
     write_dataframe(projection, projection_path)
     write_dataframe(analysis_report, analysis_path)
@@ -1471,18 +1496,19 @@ def _run_split(
         close_logger()
         return 1
+    has_test_output = test_ratio > 0 or stats.inject_test_rows > 0
     run_stage("写出 train", logger=logger)
     write_dataframe(train_df, train_output_path)
     run_stage("写出 valid", logger=logger)
     write_dataframe(validation_df, validation_output_path)
-    if test_ratio > 0:
+    if has_test_output:
         run_stage("写出 test", logger=logger)
         write_dataframe(test_df, test_output_path)
     extra_output_files: dict[str, Path] = {
         "train_file": train_output_path,
         "validation_file": validation_output_path,
     }
-    if test_ratio > 0:
+    if has_test_output:
         extra_output_files["test_file"] = test_output_path
     _write_meta(
         output_path=base_output_path,
@@ -1509,7 +1535,7 @@ def _run_split(
         stats,
         train_output_path=train_output_path,
         validation_output_path=validation_output_path,
-        test_output_path=test_output_path if test_ratio > 0 else None,
+        test_output_path=test_output_path if has_test_output else None,
         logger=logger,
     )
     close_logger()
@@ -1818,7 +1844,7 @@ def _run_convert_sft(
     sft_system_prompt: str,
     sft_system_column: str,
     sft_user_query_as_instruction: bool = True,
-    sft_pa_max_records_per_file: int = PA_MAX_RECORDS_PER_FILE,
+    sft_max_records_per_file: int = PA_MAX_RECORDS_PER_FILE,
 ) -> int:
     input_path = Path(input_file)
     output_path = _resolve_sft_output_path(input_path, output_arg, sft_format)
@@ -1863,7 +1889,7 @@ def _run_convert_sft(
     run_stage("写出结果", logger=logger)
     if sft_format == PA_SFT_FORMAT:
         written_paths = write_pa_dataset(
-            records, output_path, max_records_per_file=sft_pa_max_records_per_file
+            records, output_path, max_records_per_file=sft_max_records_per_file
         )
     else:
         write_alpaca_dataset(records, output_path)
@@ -1880,7 +1906,7 @@ def _run_convert_sft(
             "sft_system_prompt": sft_system_prompt,
             "sft_system_column": sft_system_column,
             "sft_user_query_as_instruction": sft_user_query_as_instruction,
-            "sft_pa_max_records_per_file": sft_pa_max_records_per_file,
+            "sft_max_records_per_file": sft_max_records_per_file,
         },
         sft_conversion_stats=stats,
         extra_output_files={f"output_file_{i + 1}": p for i, p in enumerate(written_paths)}

{mysphinx_forge-0.2.2 → mysphinx_forge-0.2.3}/mysphinx_forge/file_io.py RENAMED Viewed

@@ -129,6 +129,33 @@ def write_dataframe(dataframe: pd.DataFrame, output_path: str | Path) -> None:
     dataframe.to_excel(path, index=False)
+def write_dataframe_with_injection_sheets(
+    dataframe: pd.DataFrame,
+    output_path: str | Path,
+    *,
+    train_inject: pd.DataFrame | None = None,
+    valid_inject: pd.DataFrame | None = None,
+    test_inject: pd.DataFrame | None = None,
+) -> None:
+    path = Path(output_path)
+    if path.suffix.lower() == ".csv":
+        dataframe.to_csv(path, index=False)
+        return
+    sheets = {"Sheet1": dataframe}
+    for sheet_name, frame in (
+        (TRAIN_SHEET_NAME, train_inject),
+        (VALID_SHEET_NAME, valid_inject),
+        (TEST_SHEET_NAME, test_inject),
+    ):
+        if frame is not None and not frame.empty:
+            sheets[sheet_name] = frame
+    with pd.ExcelWriter(path) as writer:
+        for sheet_name, frame in sheets.items():
+            frame.to_excel(writer, sheet_name=sheet_name, index=False)
 def write_progress_message(message: str, *, stream: TextIO | None = None) -> None:
     output_stream = stream or sys.stderr
     with tqdm.external_write_mode(file=output_stream):

{mysphinx_forge-0.2.2 → mysphinx_forge-0.2.3}/mysphinx_forge/templates/mysphinx-forge.yaml RENAMED Viewed

@@ -409,3 +409,7 @@ convert-sft:
   # 按行读取的 system 列。
   # 若提供且该行非空，则优先使用该列值覆盖 sft_system_prompt。
   sft_system_column: ""
+  # pa 格式每个 JSONL 文件最大记录数，超出时自动切分为多个文件。
+  # 仅 sft_format=pa 时生效。
+  sft_max_records_per_file: 10000

{mysphinx_forge-0.2.2 → mysphinx_forge-0.2.3}/mysphinx_forge.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mysphinx-forge
-Version: 0.2.2
+Version: 0.2.3
 Summary: Data and model workflow toolkit for cleaning, clustering, generation, and evaluation
 Keywords: data-cleaning,deduplication,clustering,nlp,cli
 Classifier: Development Status :: 3 - Alpha
@@ -487,10 +487,10 @@ input_deduplicated_split_train_pa_1.jsonl   # 前 10000 条
 input_deduplicated_split_train_pa_2.jsonl   # 后 2000 条
 ```
-通过 `--sft-pa-max-records-per-file` 可自定义阈值：
+通过 `--sft-max-records-per-file` 可自定义阈值：
 ```bash
-mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-pa-max-records-per-file 5000
+mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-max-records-per-file 5000
 ```
 说明：
@@ -555,6 +555,8 @@ mysphinx-forge --action split --input-file data/input_deduplicated.xlsx
 三者均为可选，可以同时存在，也可以只有其中一个或多个。
+`clean`、`deduplicate`、`clean-deduplicate`、`cluster` 这几个 `split` 之前的步骤会原样保留这三个特殊 sheet（不参与清洗/去重/聚类处理），并在输出文件中继续以独立 sheet 的形式存在，确保依次执行整条流水线后，`split` 仍能正确识别并注入这些数据。
 显式分层切分：
 ```bash

{mysphinx_forge-0.2.2 → mysphinx_forge-0.2.3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "mysphinx-forge"
-version = "0.2.2"
+version = "0.2.3"
 description = "Data and model workflow toolkit for cleaning, clustering, generation, and evaluation"
 readme = "README.md"
 requires-python = ">=3.12"

{mysphinx_forge-0.2.2 → mysphinx_forge-0.2.3}/tests/test_cli.py RENAMED Viewed

@@ -387,7 +387,7 @@ def test_main_cli_overrides_config_values(tmp_path, monkeypatch, capsys) -> None
     assert meta["parameters"]["test_ratio"] == 0.1
-def test_main_split_injects_increment_sheet_into_train_and_valid(
+def test_main_split_injects_train_valid_test_sheets_exclusively(
     tmp_path, monkeypatch, capsys
 ) -> None:
     input_file = tmp_path / "input.xlsx"
@@ -400,10 +400,22 @@ def test_main_split_injects_increment_sheet_into_train_and_valid(
         ).to_excel(writer, sheet_name="base_a", index=False)
         pd.DataFrame(
             {
-                "text": ["增量问题1", "增量问题2"],
-                "category": ["增量", "增量"],
+                "text": ["训练注入1"],
+                "category": ["增量"],
             }
-        ).to_excel(writer, sheet_name="increment", index=False)
+        ).to_excel(writer, sheet_name="Train", index=False)
+        pd.DataFrame(
+            {
+                "text": ["验证注入1"],
+                "category": ["增量"],
+            }
+        ).to_excel(writer, sheet_name="valid", index=False)
+        pd.DataFrame(
+            {
+                "text": ["测试注入1"],
+                "category": ["增量"],
+            }
+        ).to_excel(writer, sheet_name="TEST", index=False)
     monkeypatch.setattr(
         sys,
@@ -425,23 +437,198 @@ def test_main_split_injects_increment_sheet_into_train_and_valid(
     captured = capsys.readouterr()
     assert exit_code == 0
-    assert "增量工作表：increment" in captured.out
-    assert "增量注入行数：2" in captured.out
+    assert "注入训练集行数（'train' sheet）：1" in captured.out
+    assert "注入验证集行数（'valid' sheet）：1" in captured.out
+    assert "注入测试集行数（'test' sheet）：1" in captured.out
     train = pd.read_excel(tmp_path / "input_split_train.xlsx")
     valid = pd.read_excel(tmp_path / "input_split_valid.xlsx")
     test = pd.read_excel(tmp_path / "input_split_test.xlsx")
-    assert set(train["text"].tolist()) >= {"增量问题1", "增量问题2"}
-    assert set(valid["text"].tolist()) >= {"增量问题1", "增量问题2"}
-    assert "增量问题1" not in test["text"].tolist()
-    assert "增量问题2" not in test["text"].tolist()
+    assert "训练注入1" in train["text"].tolist()
+    assert "训练注入1" not in valid["text"].tolist()
+    assert "训练注入1" not in test["text"].tolist()
+    assert "验证注入1" in valid["text"].tolist()
+    assert "验证注入1" not in train["text"].tolist()
+    assert "验证注入1" not in test["text"].tolist()
+    assert "测试注入1" in test["text"].tolist()
+    assert "测试注入1" not in train["text"].tolist()
+    assert "测试注入1" not in valid["text"].tolist()
+    meta = json.loads((tmp_path / "input_split.meta.json").read_text(encoding="utf-8"))
+    assert meta["parameters"]["inject_train_rows"] == 1
+    assert meta["parameters"]["inject_valid_rows"] == 1
+    assert meta["parameters"]["inject_test_rows"] == 1
+    assert meta["split_stats"]["inject_train_rows"] == 1
+    assert meta["split_stats"]["inject_valid_rows"] == 1
+    assert meta["split_stats"]["inject_test_rows"] == 1
+def test_main_split_writes_test_file_when_test_ratio_zero_but_test_sheet_injected(
+    tmp_path, monkeypatch, capsys
+) -> None:
+    input_file = tmp_path / "input.xlsx"
+    with pd.ExcelWriter(input_file) as writer:
+        pd.DataFrame(
+            {
+                "text": [f"问题{i}" for i in range(6)],
+                "category": ["基金"] * 3 + ["股票"] * 3,
+            }
+        ).to_excel(writer, sheet_name="base_a", index=False)
+        pd.DataFrame(
+            {
+                "text": ["测试注入1"],
+                "category": ["增量"],
+            }
+        ).to_excel(writer, sheet_name="test", index=False)
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "main.py",
+            "--action",
+            "split",
+            "--input-file",
+            str(input_file),
+            "--validation-ratio",
+            "0.2",
+            "--test-ratio",
+            "0",
+        ],
+    )
+    exit_code = main()
+    captured = capsys.readouterr()
+    assert exit_code == 0
+    assert "注入测试集行数（'test' sheet）：1" in captured.out
+    test_output_path = tmp_path / "input_split_test.xlsx"
+    assert test_output_path.exists()
+    test = pd.read_excel(test_output_path)
+    assert "测试注入1" in test["text"].tolist()
     meta = json.loads((tmp_path / "input_split.meta.json").read_text(encoding="utf-8"))
-    assert meta["parameters"]["increment_sheet_name"] == "increment"
-    assert meta["parameters"]["increment_rows"] == 2
-    assert meta["split_stats"]["increment_sheet_name"] == "increment"
-    assert meta["split_stats"]["increment_rows"] == 2
+    assert meta["parameters"]["inject_test_rows"] == 1
+    assert "test_file" in meta["output_files"]
+def test_main_clean_preserves_injection_sheets_for_downstream_split(
+    tmp_path, monkeypatch, capsys
+) -> None:
+    input_file = tmp_path / "input.xlsx"
+    with pd.ExcelWriter(input_file) as writer:
+        pd.DataFrame(
+            {
+                "text": [f"问题{i}" for i in range(6)] + ["!!!"],
+                "category": ["基金"] * 3 + ["股票"] * 3 + ["噪音"],
+            }
+        ).to_excel(writer, sheet_name="base_a", index=False)
+        pd.DataFrame({"text": ["训练注入1"], "category": ["增量"]}).to_excel(
+            writer, sheet_name="train", index=False
+        )
+        pd.DataFrame({"text": ["测试注入1"], "category": ["增量"]}).to_excel(
+            writer, sheet_name="test", index=False
+        )
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        ["main.py", "--action", "clean", "--input-file", str(input_file)],
+    )
+    exit_code = main()
+    captured = capsys.readouterr()
+    assert exit_code == 0
+    assert "清洗后总行数：6" in captured.out
+    output_file = tmp_path / "input_cleaned.xlsx"
+    sheets = pd.read_excel(output_file, sheet_name=None)
+    assert "!!!" not in sheets["Sheet1"]["text"].tolist()
+    assert sheets["train"]["text"].tolist() == ["训练注入1"]
+    assert sheets["test"]["text"].tolist() == ["测试注入1"]
+def test_main_deduplicate_preserves_injection_sheets_for_downstream_split(
+    tmp_path, monkeypatch, capsys
+) -> None:
+    input_file = tmp_path / "input.xlsx"
+    with pd.ExcelWriter(input_file) as writer:
+        pd.DataFrame(
+            {
+                "text": ["问题1", "问题1", "问题2"],
+                "category": ["基金", "基金", "股票"],
+            }
+        ).to_excel(writer, sheet_name="base_a", index=False)
+        pd.DataFrame({"text": ["测试注入1"], "category": ["增量"]}).to_excel(
+            writer, sheet_name="test", index=False
+        )
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        ["main.py", "--action", "deduplicate", "--input-file", str(input_file)],
+    )
+    exit_code = main()
+    captured = capsys.readouterr()
+    assert exit_code == 0
+    output_file = tmp_path / "input_deduplicated.xlsx"
+    sheets = pd.read_excel(output_file, sheet_name=None)
+    assert sorted(sheets["Sheet1"]["text"].tolist()) == ["问题1", "问题2"]
+    assert sheets["test"]["text"].tolist() == ["测试注入1"]
+def test_main_split_after_clean_includes_test_sheet_rows_in_test_output(
+    tmp_path, monkeypatch, capsys
+) -> None:
+    input_file = tmp_path / "input.xlsx"
+    with pd.ExcelWriter(input_file) as writer:
+        pd.DataFrame(
+            {
+                "text": [f"问题{i}" for i in range(8)],
+                "category": ["基金"] * 4 + ["股票"] * 4,
+            }
+        ).to_excel(writer, sheet_name="base_a", index=False)
+        pd.DataFrame({"text": ["测试注入1"], "category": ["增量"]}).to_excel(
+            writer, sheet_name="test", index=False
+        )
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        ["main.py", "--action", "clean", "--input-file", str(input_file)],
+    )
+    assert main() == 0
+    capsys.readouterr()
+    cleaned_file = tmp_path / "input_cleaned.xlsx"
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        [
+            "main.py",
+            "--action",
+            "split",
+            "--input-file",
+            str(cleaned_file),
+            "--validation-ratio",
+            "0.2",
+            "--test-ratio",
+            "0.2",
+        ],
+    )
+    exit_code = main()
+    captured = capsys.readouterr()
+    assert exit_code == 0
+    assert "注入测试集行数（'test' sheet）：1" in captured.out
+    test_output = pd.read_excel(tmp_path / "input_cleaned_split_test.xlsx")
+    assert "测试注入1" in test_output["text"].tolist()
 def test_main_split_rejects_missing_group_column(tmp_path, monkeypatch, capsys) -> None: