mlx-raclate 0.1.0b1__py3-none-any.whl

@@ -0,0 +1,247 @@
+ from pathlib import Path
+ from typing import Dict, Optional, Tuple
+ from datasets import load_dataset as hf_load_dataset
+ from datasets import DatasetDict, ClassLabel, Sequence
+ from datasets import Dataset as HFDataset
+
+ class DatasetArgs:
+     """
+     Arguments for dataset loading.
+     If a remapping of column names is needed, specify the field names here:
+     - text_field : main text
+     - label_field : label / classification target
+     - text_pair_field : text pair (optional; for contrastive learning, sentence similarity, or sequence classification with two inputs)
+     - negative_field : negative example (optional; for triplet loss)
+     """
+     def __init__(self, data: str, task_type: str,
+                  text_field: Optional[str] = "text", label_field: Optional[str] = "label",
+                  text_pair_field: Optional[str] = None, negative_field: Optional[str] = None,
+                  test: Optional[bool] = False
+                  ):
+         self.data = data
+         self.task_type = task_type
+         self.text_field = text_field
+         self.label_field = label_field
+         self.text_pair_field = text_pair_field
+         self.negative_field = negative_field
+         self.test = test  # whether to create a test set if not present
+
+
+ def _standardize_column_names(dataset: HFDataset, args: DatasetArgs) -> HFDataset:
+     """
+     Renames columns to the standard 'text', 'label', 'text_pair', 'negative' names expected by collators.
+
+     Common mappings for similarity tasks:
+     - anchor / sentence A -> 'text'
+     - positive / reference / sentence B -> 'text_pair'
+     - hard negative / sentence C -> 'negative' (optional)
+     - similarity score for regression -> 'label' (optional)
+
+     Manual mappings can be specified via args using text_field, label_field, text_pair_field, negative_field:
+     text_field : column name for the main text input
+     label_field : column name for the label / score
+     text_pair_field (optional) : column name for the paired text input / sentence B (used for cross-encoders or bi-encoders)
+     negative_field (optional) : column name for the negative example (used for triplet training)
+     """
+
+     mapping = {}
+     # Manual field mappings
+     if args.text_field != "text" and args.text_field in dataset.column_names:
+         mapping[args.text_field] = "text"
+
+     if args.text_pair_field and args.text_pair_field != "text_pair" and args.text_pair_field in dataset.column_names:
+         mapping[args.text_pair_field] = "text_pair"
+
+     if args.label_field != "label" and args.label_field in dataset.column_names:
+         mapping[args.label_field] = "label"
+
+     if args.negative_field and args.negative_field != "negative" and args.negative_field in dataset.column_names:
+         mapping[args.negative_field] = "negative"
+
+     # Handle common alternative column names per task type
+     if args.task_type in ("sentence-similarity", "sentence-transformers"):
+         # Handle scored sentence pairs: "sentence1" -> "text", "sentence2" -> "text_pair", "score" -> "label"
+         if "sentence1" in dataset.column_names and "sentence2" in dataset.column_names and "score" in dataset.column_names:
+             mapping["sentence1"] = "text"
+             mapping["sentence2"] = "text_pair"
+             mapping["score"] = "label"
+
+         # Handle anchor, positive and negative columns for triplet training
+         if "anchor" in dataset.column_names and "positive" in dataset.column_names and "negative" in dataset.column_names:
+             mapping["anchor"] = "text"
+             mapping["positive"] = "text_pair"
+             mapping["negative"] = "negative"
+
+         if "pos" in dataset.column_names:
+             mapping["pos"] = "text_pair"
+         if "neg" in dataset.column_names:
+             mapping["neg"] = "negative"
+
+     # Handle token classification: usually "tokens" -> "text", "ner_tags" -> "labels"
+     if args.task_type == "token-classification":
+         if "tokens" in dataset.column_names and "text" not in mapping.values():
+             mapping["tokens"] = "text"
+         if "ner_tags" in dataset.column_names and "labels" not in mapping.values():
+             mapping["ner_tags"] = "labels"
+
+     if mapping:
+         dataset = dataset.rename_columns(mapping)
+
+     keep_columns = {"text", "text_pair", "label", "labels", "negative"}
+     existing_columns = set(dataset.column_names)
+     columns_to_select = list(keep_columns.intersection(existing_columns))
+
+     # Check that we have at least 'text'
+     if "text" not in columns_to_select:
+         print(f"Warning: Standard 'text' column not found in dataset columns: {dataset.column_names}")
+
+     dataset = dataset.select_columns(columns_to_select)
+
+     return dataset
+
+
+ def get_label_mapping(dataset: HFDataset, args: DatasetArgs) -> Tuple[Optional[Dict[int, str]], Optional[Dict[str, int]]]:
+     """
+     Derives id2label and label2id from a dataset.
+     Prioritizes dataset features (from the config), and falls back to scanning unique values in the data.
+     """
+     if args.task_type not in ["text-classification", "token-classification"]:
+         return None, None
+
+     # Determine the target column name based on the task
+     target_col = "labels" if args.task_type == "token-classification" else "label"
+     if target_col not in dataset.column_names:
+         # Fallback: sometimes text-classification uses 'labels' or vice versa
+         if "label" in dataset.column_names: target_col = "label"
+         elif "labels" in dataset.column_names: target_col = "labels"
+         else: return None, None
+
+     labels = []
+
+     # Strategy 1: check features (config / Hub metadata)
+     feature = dataset.features[target_col]
+
+     # Case A: standard ClassLabel (text classification)
+     if isinstance(feature, ClassLabel):
+         labels = feature.names
+
+     # Case B: sequence of ClassLabels (token classification)
+     elif isinstance(feature, Sequence) and isinstance(feature.feature, ClassLabel):
+         labels = feature.feature.names
+
+     # Strategy 2: scan the data (raw JSONL/CSV)
+     if not labels:
+         if len(dataset) > 0:
+             if args.task_type == "token-classification":
+                 # Flatten the list of lists to find unique tags
+                 unique_tags = set()
+                 for row in dataset[target_col]:
+                     unique_tags.update(row)
+                 labels = sorted(list(unique_tags))
+             else:
+                 # Standard text classification scan
+                 labels = sorted(list(set(dataset[target_col])))
+
+     if not labels:
+         return None, None
+
+     # Construct mappings
+     id2label = {k: str(v) for k, v in enumerate(labels)}
+     label2id = {str(v): k for k, v in enumerate(labels)}
+
+     return id2label, label2id
+
+
+ def load_dataset(args: DatasetArgs) -> Tuple[Optional[HFDataset], Optional[HFDataset], Optional[HFDataset], Optional[Dict[int, str]], Optional[Dict[str, int]]]:
+     if not hasattr(args, "task_type"):
+         raise ValueError("Must specify task_type in args")
+
+     supported_tasks = ["text-classification", "masked-lm", "token-classification", "sentence-transformers", "sentence-similarity"]
+     if args.task_type not in supported_tasks:
+         raise ValueError(f"Unsupported task type: {args.task_type}")
+
+     # Load from the Hub or a local path
+     data_path = Path(args.data)
+     if data_path.exists():
+         # Detect the format from the extension if it's a file, or assume a split structure if it's a folder
+         if data_path.is_file():
+             # Single file loading
+             ext = data_path.suffix[1:]  # remove the dot
+             ext = "json" if ext == "jsonl" else ext
+             raw_datasets = hf_load_dataset(ext, data_files=str(data_path))
+             # If it loaded as 'train' only, we split later
+         else:
+             # It's a directory. Check for split-specific files.
+             data_files = {}
+             for split in ["train", "validation", "test"]:
+                 for ext in ["jsonl", "json", "parquet", "csv"]:
+                     fname = f"{split}.{ext}"
+                     if (data_path / fname).exists():
+                         data_files[split] = str(data_path / fname)
+
+             if not data_files:
+                 raise ValueError(f"No train/val/test files found in {data_path}")
+
+             # Determine the loader type from the first file found
+             first_file = list(data_files.values())[0]
+             ext = first_file.split(".")[-1]
+             ext = "json" if ext == "jsonl" else ext
+             raw_datasets = hf_load_dataset(ext, data_files=data_files)
+
+     else:
+         # Load from the Hub
+         try:
+             raw_datasets = hf_load_dataset(args.data)
+         except Exception as e:
+             print(f"Failed to load as standard dataset: {e}. Trying simple load...")
+             raw_datasets = hf_load_dataset(args.data, split="train")
+             raw_datasets = DatasetDict({"train": raw_datasets})
+
+     if "train" not in raw_datasets:
+         raise ValueError("Training split not found in dataset")
+
+     # Handle splits: keep the ones provided, or carve out missing ones (targeting roughly 70/15/15)
+     if "validation" not in raw_datasets and "test" not in raw_datasets:
+         if args.test:
+             t_t_split = raw_datasets["train"].train_test_split(test_size=0.15, seed=42)
+             raw_datasets["test"] = t_t_split["test"]
+             # 0.176 of the remaining 85% is roughly 15% of the original data
+             t_v_split = t_t_split["train"].train_test_split(test_size=0.176, seed=42)
+             raw_datasets["train"] = t_v_split["train"]
+             raw_datasets["validation"] = t_v_split["test"]
+         else:  # create only a validation split
+             t_v_split = raw_datasets["train"].train_test_split(test_size=0.176, seed=42)
+             raw_datasets["train"] = t_v_split["train"]
+             raw_datasets["validation"] = t_v_split["test"]
+     elif "validation" not in raw_datasets and "test" in raw_datasets:
+         if args.test:
+             t_v_split = raw_datasets["train"].train_test_split(test_size=0.176, seed=42)
+             raw_datasets["train"] = t_v_split["train"]
+             raw_datasets["validation"] = t_v_split["test"]
+         else:  # use the test split as the validation split
+             raw_datasets["validation"] = raw_datasets["test"]
+             raw_datasets["test"] = None
+     elif "test" not in raw_datasets and args.test:
+         t_t_split = raw_datasets["train"].train_test_split(test_size=0.176, seed=42)
+         raw_datasets["train"] = t_t_split["train"]
+         raw_datasets["test"] = t_t_split["test"]
+
+     # Standardize columns
+     for split in raw_datasets.keys():
+         if raw_datasets[split] is not None:
+             print(f"Standardizing columns for split '{split}' ({len(raw_datasets[split])} examples)...")
+             raw_datasets[split] = _standardize_column_names(raw_datasets[split], args)
+
+     # Get label mappings if applicable
+     id2label, label2id = None, None
+     if raw_datasets.get("train") is not None:
+         id2label, label2id = get_label_mapping(raw_datasets["train"], args)
+
+     if id2label:
+         print(f"Found {len(id2label)} labels. First 5: {list(id2label.values())[:5]}")
+
+     return (
+         raw_datasets.get("train"),
+         raw_datasets.get("validation"),
+         raw_datasets.get("test"),
+         id2label,
+         label2id
+     )
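
For orientation, here is a minimal usage sketch of the loader above. The import path `mlx_raclate.data` is an assumption (the diff does not show file names), and `stanfordnlp/sst2` is only an illustrative Hub dataset whose `sentence` column needs remapping to the standard `text` column.

    from mlx_raclate.data import DatasetArgs, load_dataset  # assumed import path

    args = DatasetArgs(
        data="stanfordnlp/sst2",           # Hub ID, local file, or folder with split files
        task_type="text-classification",
        text_field="sentence",             # remapped to the standard "text" column
        test=True,                         # carve out a test split only if one is missing
    )

    # Returns (train, validation, test, id2label, label2id)
    train_ds, valid_ds, test_ds, id2label, label2id = load_dataset(args)
    print(len(train_ds), id2label)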
@@ -0,0 +1,206 @@
+ from typing import List, Optional
+ import importlib
+
+ # Pipeline to module mapping
+ _PIPELINE_TO_MODULE = {
+     "text-classification": "text_classification",
+     "sentence-similarity": "sentence_similarity",
+     "sentence-transformers": "sentence_similarity",  # same module, similar code
+     "embeddings": "embeddings",
+     "masked-lm": "masked_lm",
+     "zero-shot-classification": "zero_shot",
+ }
+
+
+ def get_inference_code(
+     pipeline: str,
+     model_path: str = "{{MODEL_PATH}}",
+     **kwargs,
+ ) -> str:
+     """
+     Get inference example code for a model card.
+
+     This function returns clean, runnable Python code that can be directly
+     used in HuggingFace model cards. The code comes from the same source
+     as the test suite, ensuring consistency.
+
+     Args:
+         pipeline: The pipeline type (e.g., "text-classification", "sentence-similarity")
+         model_path: The model path to use in the example. Use "{{MODEL_PATH}}" as a
+             placeholder if the actual path isn't known yet.
+         **kwargs: Additional arguments passed to the specific pipeline's get_example_code()
+             function. Common options:
+             - text: str - for masked-lm, zero-shot
+             - texts: List[str] - for text-classification, embeddings
+             - text_pairs: List[str] - for text-classification
+             - documents: List[str] - for text-classification
+             - queries: List[str] - for text-classification
+             - is_regression: bool - for text-classification
+             - use_late_interaction: bool - for sentence-similarity (ColBERT-style)
+             - label_candidates: List or Dict - for zero-shot
+
+     Returns:
+         Formatted Python code string ready for inclusion in a model card.
+     """
+     if pipeline not in _PIPELINE_TO_MODULE:
+         raise ValueError(
+             f"Unknown pipeline: {pipeline}. "
+             f"Supported pipelines: {list(_PIPELINE_TO_MODULE.keys())}"
+         )
+
+     module_name = _PIPELINE_TO_MODULE[pipeline]
+
+     # Try importing from tests.inference_examples first (development);
+     # fall back if the submodule import fails
+     try:
+         module = importlib.import_module(f"tests.inference_examples.{module_name}")
+     except ImportError:
+         # If the direct import fails, try attribute access on the tests package
+         try:
+             import tests.inference_examples
+             module = getattr(tests.inference_examples, module_name)
+         except (ImportError, AttributeError):
+             raise ImportError(
+                 f"Could not import inference example module for {pipeline}. "
+                 "Make sure the tests package is installed or accessible."
+             )
+
+     # Call the module's get_example_code function
+     return module.get_example_code(model_path=model_path, **kwargs)
+
+
+ def get_available_pipelines() -> List[str]:
+     """Get list of pipelines that have model card code templates."""
+     return list(_PIPELINE_TO_MODULE.keys())
+
+
+ def generate_model_card_section(
+     pipeline: str,
+     model_path: str,
+     title: str = "Usage with mlx-raclate",
+     **kwargs,
+ ) -> str:
+     """
+     Generate a complete model card section with title and code block.
+
+     Args:
+         pipeline: The pipeline type
+         model_path: The model path
+         title: Section title
+         **kwargs: Additional arguments for get_inference_code()
+
+     Returns:
+         Markdown-formatted section for a model card
+     """
+     code = get_inference_code(pipeline=pipeline, model_path=model_path, **kwargs)
+
+     return f"""## {title}
+
+ This model can be used with [mlx-raclate](https://github.com/pappitti/mlx-raclate) for native inference on Apple Silicon.
+
+ ```python
+ {code}
+ ```
+ """
+
+
+ def get_code_for_trained_model(
+     pipeline: str,
+     model_path: str,
+     base_model: str,
+     training_task: Optional[str] = None,
+     **kwargs,
+ ) -> str:
+     """
+     Generate model card content for a newly trained model.
+
+     This is intended to be called after training, to generate the
+     inference example code for the model card before uploading to HuggingFace.
+
+     Args:
+         pipeline: Pipeline the model was trained for
+         model_path: Path where the model will be uploaded (e.g., "my-org/my-model")
+         base_model: The base model used for training
+         training_task: Optional description of the training task
+         **kwargs: Additional arguments for the code template
+
+     Returns:
+         Complete markdown section for the model card
+     """
+     section = generate_model_card_section(
+         pipeline=pipeline,
+         model_path=model_path,
+         **kwargs,
+     )
+
+     # Add metadata about training
+     metadata = f"""
+ ### Model Details
+
+ - **Base Model**: [{base_model}](https://huggingface.co/{base_model})
+ - **Pipeline**: `{pipeline}`
+ - **Framework**: [mlx-raclate](https://github.com/pappitti/mlx-raclate) (MLX)
+ """
+     if training_task:
+         metadata += f"- **Training Task**: {training_task}\n"
+
+     return section + metadata
+
+
+ # ============================================================================
+ # CLI INTERFACE
+ # ============================================================================
+
+ def main():
+     """CLI for generating model card code snippets."""
+     import argparse
+
+     parser = argparse.ArgumentParser(
+         description="Generate inference code for model cards"
+     )
+     parser.add_argument(
+         "pipeline",
+         choices=get_available_pipelines(),
+         help="Pipeline type"
+     )
+     parser.add_argument(
+         "--model-path",
+         default="{{MODEL_PATH}}",
+         help="Model path/ID for the example"
+     )
+     parser.add_argument(
+         "--late-interaction",
+         action="store_true",
+         help="Use late interaction for sentence-similarity"
+     )
+     parser.add_argument(
+         "--full-section",
+         action="store_true",
+         help="Generate a full markdown section instead of just code"
+     )
+
+     args = parser.parse_args()
+
+     kwargs = {}
+     if args.late_interaction:
+         kwargs["use_late_interaction"] = True
+
+     if args.full_section:
+         output = generate_model_card_section(
+             pipeline=args.pipeline,
+             model_path=args.model_path,
+             **kwargs,
+         )
+     else:
+         output = get_inference_code(
+             pipeline=args.pipeline,
+             model_path=args.model_path,
+             **kwargs,
+         )
+
+     print(output)
+
+
+ if __name__ == "__main__":
+     main()
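
As a quick orientation, here is how the helpers above would be called. The import path `mlx_raclate.model_cards` is an assumption (the diff does not name the file), and the model IDs are placeholders.

    from mlx_raclate.model_cards import (  # assumed import path
        get_inference_code,
        generate_model_card_section,
    )

    # Snippet only, with the "{{MODEL_PATH}}" placeholder left to fill in later
    code = get_inference_code("text-classification")

    # Full markdown section for a known model path
    section = generate_model_card_section(
        pipeline="sentence-similarity",
        model_path="my-org/my-model",
    )
    print(section)

If the file is runnable as a module, the CLI would be invoked along the lines of `python -m mlx_raclate.model_cards sentence-similarity --full-section` (module name assumed), which prints the same output as the programmatic call.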