nexaai 1.0.4rc10__py3-none-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nexaai might be problematic. Click here for more details.

Files changed (519)
  1. nexaai/__init__.py +71 -0
  2. nexaai/_version.py +4 -0
  3. nexaai/asr.py +60 -0
  4. nexaai/asr_impl/__init__.py +0 -0
  5. nexaai/asr_impl/mlx_asr_impl.py +91 -0
  6. nexaai/asr_impl/pybind_asr_impl.py +43 -0
  7. nexaai/base.py +39 -0
  8. nexaai/binds/__init__.py +3 -0
  9. nexaai/binds/common_bind.cpython-310-darwin.so +0 -0
  10. nexaai/binds/embedder_bind.cpython-310-darwin.so +0 -0
  11. nexaai/binds/libnexa_bridge.dylib +0 -0
  12. nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
  13. nexaai/binds/nexa_llama_cpp/libggml-base.dylib +0 -0
  14. nexaai/binds/nexa_llama_cpp/libggml-cpu.so +0 -0
  15. nexaai/binds/nexa_llama_cpp/libggml-metal.so +0 -0
  16. nexaai/binds/nexa_llama_cpp/libggml.dylib +0 -0
  17. nexaai/binds/nexa_llama_cpp/libllama.dylib +0 -0
  18. nexaai/binds/nexa_llama_cpp/libmtmd.dylib +0 -0
  19. nexaai/binds/nexa_llama_cpp/libnexa_plugin.dylib +0 -0
  20. nexaai/binds/nexa_mlx/libnexa_plugin.dylib +0 -0
  21. nexaai/binds/nexa_mlx/py-lib/ml.py +842 -0
  22. nexaai/binds/nexa_mlx/py-lib/mlx_audio/__init__.py +0 -0
  23. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/__init__.py +1 -0
  24. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/__init__.py +5 -0
  25. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
  26. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/activation.py +51 -0
  27. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/amp.py +96 -0
  28. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
  29. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/conv.py +114 -0
  30. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/bigvgan/resample.py +177 -0
  31. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/__init__.py +1 -0
  32. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/base.py +228 -0
  33. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/dac.py +285 -0
  34. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
  35. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/nn/layers.py +129 -0
  36. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
  37. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/encodec/__init__.py +1 -0
  38. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/encodec/encodec.py +777 -0
  39. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/__init__.py +1 -0
  40. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/mimi.py +286 -0
  41. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
  42. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
  43. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
  44. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
  45. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
  46. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
  47. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/s3/__init__.py +1 -0
  48. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/s3/model.py +260 -0
  49. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/s3/model_v2.py +383 -0
  50. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/s3/utils.py +122 -0
  51. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/__init__.py +1 -0
  52. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/attention.py +97 -0
  53. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/layers.py +306 -0
  54. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/snac.py +154 -0
  55. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/snac/vq.py +135 -0
  56. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/vocos/__init__.py +1 -0
  57. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/vocos/mel.py +33 -0
  58. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/models/vocos/vocos.py +359 -0
  59. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/__init__.py +0 -0
  60. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_bigvgan.py +54 -0
  61. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_descript.py +109 -0
  62. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_encodec.py +58 -0
  63. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_mimi.py +22 -0
  64. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_s3.py +25 -0
  65. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_snac.py +40 -0
  66. nexaai/binds/nexa_mlx/py-lib/mlx_audio/codec/tests/test_vocos.py +93 -0
  67. nexaai/binds/nexa_mlx/py-lib/mlx_audio/server.py +525 -0
  68. nexaai/binds/nexa_mlx/py-lib/mlx_audio/sts/__init__.py +0 -0
  69. nexaai/binds/nexa_mlx/py-lib/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
  70. nexaai/binds/nexa_mlx/py-lib/mlx_audio/sts/voice_pipeline.py +327 -0
  71. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/__init__.py +0 -0
  72. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/generate.py +174 -0
  73. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/__init__.py +0 -0
  74. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/__init__.py +1 -0
  75. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/alignment.py +248 -0
  76. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/attention.py +187 -0
  77. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/audio.py +76 -0
  78. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/conformer.py +331 -0
  79. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/ctc.py +34 -0
  80. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
  81. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
  82. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
  83. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
  84. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
  85. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/__init__.py +1 -0
  86. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/audio.py +82 -0
  87. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/decoding.py +742 -0
  88. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/timing.py +329 -0
  89. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
  90. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/whisper.py +862 -0
  91. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/models/whisper/writers.py +268 -0
  92. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/tests/test_models.py +381 -0
  93. nexaai/binds/nexa_mlx/py-lib/mlx_audio/stt/utils.py +195 -0
  94. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/__init__.py +1 -0
  95. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/audio_player.py +120 -0
  96. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/convert.py +71 -0
  97. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/generate.py +449 -0
  98. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/__init__.py +0 -0
  99. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/bark/__init__.py +4 -0
  100. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/bark/bark.py +528 -0
  101. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/bark/isftnet.py +12 -0
  102. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/bark/pipeline.py +442 -0
  103. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/base.py +84 -0
  104. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/__init__.py +1 -0
  105. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/audio.py +287 -0
  106. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/config.py +256 -0
  107. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/dia.py +592 -0
  108. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/dia/layers.py +870 -0
  109. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/__init__.py +3 -0
  110. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/attention.py +180 -0
  111. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
  112. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/conformer.py +247 -0
  113. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  114. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
  115. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
  116. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
  117. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
  118. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/gpt2.py +38 -0
  119. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/indextts.py +412 -0
  120. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/mel.py +37 -0
  121. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/normalize.py +294 -0
  122. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/indextts/perceiver.py +62 -0
  123. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/interpolate.py +108 -0
  124. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  125. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
  126. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
  127. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/modules.py +659 -0
  128. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
  129. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/kokoro/voice.py +113 -0
  130. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/llama/__init__.py +3 -0
  131. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/llama/llama.py +324 -0
  132. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/__init__.py +1 -0
  133. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
  134. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
  135. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/outetts.py +255 -0
  136. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
  137. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/outetts/tokens.py +36 -0
  138. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/sesame/__init__.py +3 -0
  139. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/sesame/attention.py +195 -0
  140. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/sesame/sesame.py +633 -0
  141. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/sesame/watermarking.py +105 -0
  142. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/__init__.py +1 -0
  143. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
  144. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/bicodec.py +269 -0
  145. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  146. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  147. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
  148. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  149. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
  150. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
  151. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
  152. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
  153. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/residual.py +209 -0
  154. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
  155. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
  156. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
  157. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
  158. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
  159. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
  160. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/spark.py +382 -0
  161. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/utils/audio.py +220 -0
  162. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/utils/file.py +221 -0
  163. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
  164. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/__init__.py +0 -0
  165. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/test_base.py +66 -0
  166. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/test_convert.py +173 -0
  167. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/test_interpolate.py +88 -0
  168. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/tests/test_models.py +974 -0
  169. nexaai/binds/nexa_mlx/py-lib/mlx_audio/tts/utils.py +337 -0
  170. nexaai/binds/nexa_mlx/py-lib/mlx_audio/utils.py +237 -0
  171. nexaai/binds/nexa_mlx/py-lib/mlx_audio/version.py +1 -0
  172. nexaai/binds/nexa_mlx/py-lib/profiling.py +239 -0
  173. nexaai/common.py +61 -0
  174. nexaai/cv.py +87 -0
  175. nexaai/cv_impl/__init__.py +0 -0
  176. nexaai/cv_impl/mlx_cv_impl.py +88 -0
  177. nexaai/cv_impl/pybind_cv_impl.py +31 -0
  178. nexaai/embedder.py +68 -0
  179. nexaai/embedder_impl/__init__.py +0 -0
  180. nexaai/embedder_impl/mlx_embedder_impl.py +114 -0
  181. nexaai/embedder_impl/pybind_embedder_impl.py +91 -0
  182. nexaai/image_gen.py +136 -0
  183. nexaai/image_gen_impl/__init__.py +0 -0
  184. nexaai/image_gen_impl/mlx_image_gen_impl.py +291 -0
  185. nexaai/image_gen_impl/pybind_image_gen_impl.py +84 -0
  186. nexaai/llm.py +89 -0
  187. nexaai/llm_impl/__init__.py +0 -0
  188. nexaai/llm_impl/mlx_llm_impl.py +249 -0
  189. nexaai/llm_impl/pybind_llm_impl.py +207 -0
  190. nexaai/mlx_backend/asr/__init__.py +12 -0
  191. nexaai/mlx_backend/asr/interface.py +122 -0
  192. nexaai/mlx_backend/common/__init__.py +0 -0
  193. nexaai/mlx_backend/common/utils.py +25 -0
  194. nexaai/mlx_backend/cv/__init__.py +0 -0
  195. nexaai/mlx_backend/cv/generate.py +195 -0
  196. nexaai/mlx_backend/cv/interface.py +151 -0
  197. nexaai/mlx_backend/cv/main.py +81 -0
  198. nexaai/mlx_backend/cv/modeling/pp_ocr_v4.py +1736 -0
  199. nexaai/mlx_backend/embedding/__init__.py +0 -0
  200. nexaai/mlx_backend/embedding/generate.py +130 -0
  201. nexaai/mlx_backend/embedding/interface.py +312 -0
  202. nexaai/mlx_backend/embedding/main.py +82 -0
  203. nexaai/mlx_backend/embedding/modeling/__init__.py +0 -0
  204. nexaai/mlx_backend/embedding/modeling/nexa_jina_v2.py +399 -0
  205. nexaai/mlx_backend/llm/__init__.py +0 -0
  206. nexaai/mlx_backend/llm/generate.py +149 -0
  207. nexaai/mlx_backend/llm/interface.py +764 -0
  208. nexaai/mlx_backend/llm/main.py +68 -0
  209. nexaai/mlx_backend/ml.py +842 -0
  210. nexaai/mlx_backend/mlx_audio/__init__.py +0 -0
  211. nexaai/mlx_backend/mlx_audio/codec/__init__.py +1 -0
  212. nexaai/mlx_backend/mlx_audio/codec/models/__init__.py +5 -0
  213. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
  214. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/activation.py +51 -0
  215. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/amp.py +96 -0
  216. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
  217. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/conv.py +114 -0
  218. nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/resample.py +177 -0
  219. nexaai/mlx_backend/mlx_audio/codec/models/descript/__init__.py +1 -0
  220. nexaai/mlx_backend/mlx_audio/codec/models/descript/base.py +228 -0
  221. nexaai/mlx_backend/mlx_audio/codec/models/descript/dac.py +285 -0
  222. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
  223. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/layers.py +129 -0
  224. nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
  225. nexaai/mlx_backend/mlx_audio/codec/models/encodec/__init__.py +1 -0
  226. nexaai/mlx_backend/mlx_audio/codec/models/encodec/encodec.py +777 -0
  227. nexaai/mlx_backend/mlx_audio/codec/models/mimi/__init__.py +1 -0
  228. nexaai/mlx_backend/mlx_audio/codec/models/mimi/mimi.py +286 -0
  229. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
  230. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
  231. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
  232. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
  233. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
  234. nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
  235. nexaai/mlx_backend/mlx_audio/codec/models/s3/__init__.py +1 -0
  236. nexaai/mlx_backend/mlx_audio/codec/models/s3/model.py +260 -0
  237. nexaai/mlx_backend/mlx_audio/codec/models/s3/model_v2.py +383 -0
  238. nexaai/mlx_backend/mlx_audio/codec/models/s3/utils.py +122 -0
  239. nexaai/mlx_backend/mlx_audio/codec/models/snac/__init__.py +1 -0
  240. nexaai/mlx_backend/mlx_audio/codec/models/snac/attention.py +97 -0
  241. nexaai/mlx_backend/mlx_audio/codec/models/snac/layers.py +306 -0
  242. nexaai/mlx_backend/mlx_audio/codec/models/snac/snac.py +154 -0
  243. nexaai/mlx_backend/mlx_audio/codec/models/snac/vq.py +135 -0
  244. nexaai/mlx_backend/mlx_audio/codec/models/vocos/__init__.py +1 -0
  245. nexaai/mlx_backend/mlx_audio/codec/models/vocos/mel.py +33 -0
  246. nexaai/mlx_backend/mlx_audio/codec/models/vocos/vocos.py +359 -0
  247. nexaai/mlx_backend/mlx_audio/codec/tests/__init__.py +0 -0
  248. nexaai/mlx_backend/mlx_audio/codec/tests/test_bigvgan.py +54 -0
  249. nexaai/mlx_backend/mlx_audio/codec/tests/test_descript.py +109 -0
  250. nexaai/mlx_backend/mlx_audio/codec/tests/test_encodec.py +58 -0
  251. nexaai/mlx_backend/mlx_audio/codec/tests/test_mimi.py +22 -0
  252. nexaai/mlx_backend/mlx_audio/codec/tests/test_s3.py +25 -0
  253. nexaai/mlx_backend/mlx_audio/codec/tests/test_snac.py +40 -0
  254. nexaai/mlx_backend/mlx_audio/codec/tests/test_vocos.py +93 -0
  255. nexaai/mlx_backend/mlx_audio/server.py +525 -0
  256. nexaai/mlx_backend/mlx_audio/sts/__init__.py +0 -0
  257. nexaai/mlx_backend/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
  258. nexaai/mlx_backend/mlx_audio/sts/voice_pipeline.py +327 -0
  259. nexaai/mlx_backend/mlx_audio/stt/__init__.py +0 -0
  260. nexaai/mlx_backend/mlx_audio/stt/generate.py +174 -0
  261. nexaai/mlx_backend/mlx_audio/stt/models/__init__.py +0 -0
  262. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/__init__.py +1 -0
  263. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/alignment.py +248 -0
  264. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/attention.py +187 -0
  265. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/audio.py +76 -0
  266. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/conformer.py +331 -0
  267. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/ctc.py +34 -0
  268. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
  269. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
  270. nexaai/mlx_backend/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
  271. nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
  272. nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
  273. nexaai/mlx_backend/mlx_audio/stt/models/whisper/__init__.py +1 -0
  274. nexaai/mlx_backend/mlx_audio/stt/models/whisper/audio.py +82 -0
  275. nexaai/mlx_backend/mlx_audio/stt/models/whisper/decoding.py +742 -0
  276. nexaai/mlx_backend/mlx_audio/stt/models/whisper/timing.py +329 -0
  277. nexaai/mlx_backend/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
  278. nexaai/mlx_backend/mlx_audio/stt/models/whisper/whisper.py +862 -0
  279. nexaai/mlx_backend/mlx_audio/stt/models/whisper/writers.py +268 -0
  280. nexaai/mlx_backend/mlx_audio/stt/tests/test_models.py +381 -0
  281. nexaai/mlx_backend/mlx_audio/stt/utils.py +195 -0
  282. nexaai/mlx_backend/mlx_audio/tts/__init__.py +1 -0
  283. nexaai/mlx_backend/mlx_audio/tts/audio_player.py +120 -0
  284. nexaai/mlx_backend/mlx_audio/tts/convert.py +71 -0
  285. nexaai/mlx_backend/mlx_audio/tts/generate.py +449 -0
  286. nexaai/mlx_backend/mlx_audio/tts/models/__init__.py +0 -0
  287. nexaai/mlx_backend/mlx_audio/tts/models/bark/__init__.py +4 -0
  288. nexaai/mlx_backend/mlx_audio/tts/models/bark/bark.py +528 -0
  289. nexaai/mlx_backend/mlx_audio/tts/models/bark/isftnet.py +12 -0
  290. nexaai/mlx_backend/mlx_audio/tts/models/bark/pipeline.py +442 -0
  291. nexaai/mlx_backend/mlx_audio/tts/models/base.py +84 -0
  292. nexaai/mlx_backend/mlx_audio/tts/models/dia/__init__.py +1 -0
  293. nexaai/mlx_backend/mlx_audio/tts/models/dia/audio.py +287 -0
  294. nexaai/mlx_backend/mlx_audio/tts/models/dia/config.py +256 -0
  295. nexaai/mlx_backend/mlx_audio/tts/models/dia/dia.py +592 -0
  296. nexaai/mlx_backend/mlx_audio/tts/models/dia/layers.py +870 -0
  297. nexaai/mlx_backend/mlx_audio/tts/models/indextts/__init__.py +3 -0
  298. nexaai/mlx_backend/mlx_audio/tts/models/indextts/attention.py +180 -0
  299. nexaai/mlx_backend/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
  300. nexaai/mlx_backend/mlx_audio/tts/models/indextts/conformer.py +247 -0
  301. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
  302. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
  303. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
  304. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
  305. nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
  306. nexaai/mlx_backend/mlx_audio/tts/models/indextts/gpt2.py +38 -0
  307. nexaai/mlx_backend/mlx_audio/tts/models/indextts/indextts.py +412 -0
  308. nexaai/mlx_backend/mlx_audio/tts/models/indextts/mel.py +37 -0
  309. nexaai/mlx_backend/mlx_audio/tts/models/indextts/normalize.py +294 -0
  310. nexaai/mlx_backend/mlx_audio/tts/models/indextts/perceiver.py +62 -0
  311. nexaai/mlx_backend/mlx_audio/tts/models/interpolate.py +108 -0
  312. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/__init__.py +4 -0
  313. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
  314. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
  315. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/modules.py +659 -0
  316. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
  317. nexaai/mlx_backend/mlx_audio/tts/models/kokoro/voice.py +113 -0
  318. nexaai/mlx_backend/mlx_audio/tts/models/llama/__init__.py +3 -0
  319. nexaai/mlx_backend/mlx_audio/tts/models/llama/llama.py +324 -0
  320. nexaai/mlx_backend/mlx_audio/tts/models/outetts/__init__.py +1 -0
  321. nexaai/mlx_backend/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
  322. nexaai/mlx_backend/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
  323. nexaai/mlx_backend/mlx_audio/tts/models/outetts/default_speaker.json +461 -0
  324. nexaai/mlx_backend/mlx_audio/tts/models/outetts/outetts.py +255 -0
  325. nexaai/mlx_backend/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
  326. nexaai/mlx_backend/mlx_audio/tts/models/outetts/tokens.py +36 -0
  327. nexaai/mlx_backend/mlx_audio/tts/models/sesame/__init__.py +3 -0
  328. nexaai/mlx_backend/mlx_audio/tts/models/sesame/attention.py +195 -0
  329. nexaai/mlx_backend/mlx_audio/tts/models/sesame/sesame.py +633 -0
  330. nexaai/mlx_backend/mlx_audio/tts/models/sesame/watermarking.py +105 -0
  331. nexaai/mlx_backend/mlx_audio/tts/models/spark/__init__.py +1 -0
  332. nexaai/mlx_backend/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
  333. nexaai/mlx_backend/mlx_audio/tts/models/spark/bicodec.py +269 -0
  334. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
  335. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
  336. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
  337. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
  338. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
  339. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
  340. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
  341. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
  342. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual.py +209 -0
  343. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
  344. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
  345. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
  346. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
  347. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
  348. nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
  349. nexaai/mlx_backend/mlx_audio/tts/models/spark/spark.py +382 -0
  350. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/audio.py +220 -0
  351. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/file.py +221 -0
  352. nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
  353. nexaai/mlx_backend/mlx_audio/tts/tests/__init__.py +0 -0
  354. nexaai/mlx_backend/mlx_audio/tts/tests/test_base.py +66 -0
  355. nexaai/mlx_backend/mlx_audio/tts/tests/test_convert.py +173 -0
  356. nexaai/mlx_backend/mlx_audio/tts/tests/test_interpolate.py +88 -0
  357. nexaai/mlx_backend/mlx_audio/tts/tests/test_models.py +974 -0
  358. nexaai/mlx_backend/mlx_audio/tts/utils.py +337 -0
  359. nexaai/mlx_backend/mlx_audio/utils.py +237 -0
  360. nexaai/mlx_backend/mlx_audio/version.py +1 -0
  361. nexaai/mlx_backend/profiling.py +239 -0
  362. nexaai/mlx_backend/rerank/__init__.py +0 -0
  363. nexaai/mlx_backend/rerank/generate.py +174 -0
  364. nexaai/mlx_backend/rerank/interface.py +287 -0
  365. nexaai/mlx_backend/rerank/main.py +127 -0
  366. nexaai/mlx_backend/rerank/modeling/__init__.py +0 -0
  367. nexaai/mlx_backend/rerank/modeling/nexa_jina_rerank.py +330 -0
  368. nexaai/mlx_backend/sd/__init__.py +1 -0
  369. nexaai/mlx_backend/sd/interface.py +362 -0
  370. nexaai/mlx_backend/sd/main.py +286 -0
  371. nexaai/mlx_backend/sd/modeling/__init__.py +306 -0
  372. nexaai/mlx_backend/sd/modeling/clip.py +116 -0
  373. nexaai/mlx_backend/sd/modeling/config.py +65 -0
  374. nexaai/mlx_backend/sd/modeling/model_io.py +330 -0
  375. nexaai/mlx_backend/sd/modeling/sampler.py +105 -0
  376. nexaai/mlx_backend/sd/modeling/tokenizer.py +100 -0
  377. nexaai/mlx_backend/sd/modeling/unet.py +460 -0
  378. nexaai/mlx_backend/sd/modeling/vae.py +274 -0
  379. nexaai/mlx_backend/tts/__init__.py +12 -0
  380. nexaai/mlx_backend/tts/interface.py +276 -0
  381. nexaai/mlx_backend/vlm/__init__.py +3 -0
  382. nexaai/mlx_backend/vlm/generate.py +572 -0
  383. nexaai/mlx_backend/vlm/interface.py +406 -0
  384. nexaai/mlx_backend/vlm/main.py +157 -0
  385. nexaai/mlx_backend/vlm/modeling/__init__.py +0 -0
  386. nexaai/mlx_backend/vlm/modeling/convert.py +68 -0
  387. nexaai/mlx_backend/vlm/modeling/models/__init__.py +0 -0
  388. nexaai/mlx_backend/vlm/modeling/models/aya_vision/__init__.py +8 -0
  389. nexaai/mlx_backend/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
  390. nexaai/mlx_backend/vlm/modeling/models/aya_vision/interpolate.py +186 -0
  391. nexaai/mlx_backend/vlm/modeling/models/aya_vision/language.py +233 -0
  392. nexaai/mlx_backend/vlm/modeling/models/aya_vision/vision.py +503 -0
  393. nexaai/mlx_backend/vlm/modeling/models/base.py +202 -0
  394. nexaai/mlx_backend/vlm/modeling/models/cache.py +230 -0
  395. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
  396. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
  397. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
  398. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
  399. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
  400. nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
  401. nexaai/mlx_backend/vlm/modeling/models/florence2/__init__.py +8 -0
  402. nexaai/mlx_backend/vlm/modeling/models/florence2/florence2.py +366 -0
  403. nexaai/mlx_backend/vlm/modeling/models/florence2/language.py +488 -0
  404. nexaai/mlx_backend/vlm/modeling/models/florence2/vision.py +591 -0
  405. nexaai/mlx_backend/vlm/modeling/models/gemma3/__init__.py +8 -0
  406. nexaai/mlx_backend/vlm/modeling/models/gemma3/gemma3.py +213 -0
  407. nexaai/mlx_backend/vlm/modeling/models/gemma3/language.py +315 -0
  408. nexaai/mlx_backend/vlm/modeling/models/gemma3/vision.py +238 -0
  409. nexaai/mlx_backend/vlm/modeling/models/gemma3n/__init__.py +2 -0
  410. nexaai/mlx_backend/vlm/modeling/models/gemma3n/audio.py +1038 -0
  411. nexaai/mlx_backend/vlm/modeling/models/gemma3n/config.py +139 -0
  412. nexaai/mlx_backend/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
  413. nexaai/mlx_backend/vlm/modeling/models/gemma3n/language.py +629 -0
  414. nexaai/mlx_backend/vlm/modeling/models/gemma3n/vision.py +1022 -0
  415. nexaai/mlx_backend/vlm/modeling/models/idefics2/__init__.py +9 -0
  416. nexaai/mlx_backend/vlm/modeling/models/idefics2/idefics2.py +294 -0
  417. nexaai/mlx_backend/vlm/modeling/models/idefics2/language.py +191 -0
  418. nexaai/mlx_backend/vlm/modeling/models/idefics2/vision.py +267 -0
  419. nexaai/mlx_backend/vlm/modeling/models/idefics3/__init__.py +8 -0
  420. nexaai/mlx_backend/vlm/modeling/models/idefics3/idefics3.py +175 -0
  421. nexaai/mlx_backend/vlm/modeling/models/idefics3/language.py +192 -0
  422. nexaai/mlx_backend/vlm/modeling/models/idefics3/vision.py +233 -0
  423. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/__init__.py +9 -0
  424. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
  425. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/language.py +220 -0
  426. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/processor.py +393 -0
  427. nexaai/mlx_backend/vlm/modeling/models/internvl_chat/vision.py +293 -0
  428. nexaai/mlx_backend/vlm/modeling/models/kernels.py +307 -0
  429. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/__init__.py +8 -0
  430. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
  431. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/language.py +509 -0
  432. nexaai/mlx_backend/vlm/modeling/models/kimi_vl/vision.py +522 -0
  433. nexaai/mlx_backend/vlm/modeling/models/llama4/__init__.py +8 -0
  434. nexaai/mlx_backend/vlm/modeling/models/llama4/language.py +386 -0
  435. nexaai/mlx_backend/vlm/modeling/models/llama4/llama4.py +138 -0
  436. nexaai/mlx_backend/vlm/modeling/models/llama4/vision.py +560 -0
  437. nexaai/mlx_backend/vlm/modeling/models/llava/__init__.py +8 -0
  438. nexaai/mlx_backend/vlm/modeling/models/llava/language.py +240 -0
  439. nexaai/mlx_backend/vlm/modeling/models/llava/llava.py +153 -0
  440. nexaai/mlx_backend/vlm/modeling/models/llava/vision.py +259 -0
  441. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/__init__.py +9 -0
  442. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/language.py +236 -0
  443. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
  444. nexaai/mlx_backend/vlm/modeling/models/llava_bunny/vision.py +303 -0
  445. nexaai/mlx_backend/vlm/modeling/models/llava_next/__init__.py +8 -0
  446. nexaai/mlx_backend/vlm/modeling/models/llava_next/language.py +230 -0
  447. nexaai/mlx_backend/vlm/modeling/models/llava_next/llava_next.py +160 -0
  448. nexaai/mlx_backend/vlm/modeling/models/llava_next/vision.py +243 -0
  449. nexaai/mlx_backend/vlm/modeling/models/mistral3/__init__.py +8 -0
  450. nexaai/mlx_backend/vlm/modeling/models/mistral3/mistral3.py +283 -0
  451. nexaai/mlx_backend/vlm/modeling/models/mllama/__init__.py +8 -0
  452. nexaai/mlx_backend/vlm/modeling/models/mllama/language.py +416 -0
  453. nexaai/mlx_backend/vlm/modeling/models/mllama/mllama.py +172 -0
  454. nexaai/mlx_backend/vlm/modeling/models/mllama/vision.py +499 -0
  455. nexaai/mlx_backend/vlm/modeling/models/molmo/__init__.py +8 -0
  456. nexaai/mlx_backend/vlm/modeling/models/molmo/language.py +243 -0
  457. nexaai/mlx_backend/vlm/modeling/models/molmo/molmo.py +133 -0
  458. nexaai/mlx_backend/vlm/modeling/models/molmo/vision.py +465 -0
  459. nexaai/mlx_backend/vlm/modeling/models/multi_modality/__init__.py +10 -0
  460. nexaai/mlx_backend/vlm/modeling/models/multi_modality/language.py +230 -0
  461. nexaai/mlx_backend/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
  462. nexaai/mlx_backend/vlm/modeling/models/multi_modality/sam.py +557 -0
  463. nexaai/mlx_backend/vlm/modeling/models/multi_modality/vision.py +526 -0
  464. nexaai/mlx_backend/vlm/modeling/models/paligemma/__init__.py +8 -0
  465. nexaai/mlx_backend/vlm/modeling/models/paligemma/language.py +282 -0
  466. nexaai/mlx_backend/vlm/modeling/models/paligemma/paligemma.py +160 -0
  467. nexaai/mlx_backend/vlm/modeling/models/paligemma/vision.py +242 -0
  468. nexaai/mlx_backend/vlm/modeling/models/phi3_v/__init__.py +8 -0
  469. nexaai/mlx_backend/vlm/modeling/models/phi3_v/language.py +21 -0
  470. nexaai/mlx_backend/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
  471. nexaai/mlx_backend/vlm/modeling/models/phi3_v/su_rope.py +71 -0
  472. nexaai/mlx_backend/vlm/modeling/models/phi3_v/vision.py +324 -0
  473. nexaai/mlx_backend/vlm/modeling/models/pixtral/__init__.py +8 -0
  474. nexaai/mlx_backend/vlm/modeling/models/pixtral/language.py +229 -0
  475. nexaai/mlx_backend/vlm/modeling/models/pixtral/pixtral.py +161 -0
  476. nexaai/mlx_backend/vlm/modeling/models/pixtral/vision.py +320 -0
  477. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
  478. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
  479. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
  480. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
  481. nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
  482. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
  483. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/config.py +104 -0
  484. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/language.py +490 -0
  485. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
  486. nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/vision.py +312 -0
  487. nexaai/mlx_backend/vlm/modeling/models/smolvlm/__init__.py +8 -0
  488. nexaai/mlx_backend/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
  489. nexaai/mlx_backend/vlm/modeling/processing_qwen2_5_vl.py +209 -0
  490. nexaai/mlx_backend/vlm/modeling/processing_qwen2_vl.py +215 -0
  491. nexaai/mlx_backend/vlm/modeling/prompt_utils.py +474 -0
  492. nexaai/mlx_backend/vlm/modeling/sample_utils.py +39 -0
  493. nexaai/mlx_backend/vlm/modeling/tokenizer_utils.py +344 -0
  494. nexaai/mlx_backend/vlm/modeling/trainer/__init__.py +9 -0
  495. nexaai/mlx_backend/vlm/modeling/trainer/lora.py +70 -0
  496. nexaai/mlx_backend/vlm/modeling/trainer/trainer.py +296 -0
  497. nexaai/mlx_backend/vlm/modeling/trainer/utils.py +160 -0
  498. nexaai/mlx_backend/vlm/modeling/utils.py +928 -0
  499. nexaai/rerank.py +51 -0
  500. nexaai/rerank_impl/__init__.py +0 -0
  501. nexaai/rerank_impl/mlx_rerank_impl.py +91 -0
  502. nexaai/rerank_impl/pybind_rerank_impl.py +42 -0
  503. nexaai/runtime.py +64 -0
  504. nexaai/tts.py +70 -0
  505. nexaai/tts_impl/__init__.py +0 -0
  506. nexaai/tts_impl/mlx_tts_impl.py +93 -0
  507. nexaai/tts_impl/pybind_tts_impl.py +42 -0
  508. nexaai/utils/avatar_fetcher.py +104 -0
  509. nexaai/utils/decode.py +18 -0
  510. nexaai/utils/model_manager.py +1195 -0
  511. nexaai/utils/progress_tracker.py +372 -0
  512. nexaai/vlm.py +120 -0
  513. nexaai/vlm_impl/__init__.py +0 -0
  514. nexaai/vlm_impl/mlx_vlm_impl.py +205 -0
  515. nexaai/vlm_impl/pybind_vlm_impl.py +228 -0
  516. nexaai-1.0.4rc10.dist-info/METADATA +26 -0
  517. nexaai-1.0.4rc10.dist-info/RECORD +519 -0
  518. nexaai-1.0.4rc10.dist-info/WHEEL +5 -0
  519. nexaai-1.0.4rc10.dist-info/top_level.txt +1 -0
@@ -0,0 +1,195 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import os
4
+ import sys
5
+ import time
6
+ import math
7
+ from pathlib import Path
8
+
9
+ import cv2
10
+ import numpy as np
11
+ from PIL import Image, ImageDraw, ImageFont
12
+
13
+ from .modeling.pp_ocr_v4 import Config, TextSystem
14
+
15
+
16
def is_image_file(file_path):
    """Return True when *file_path* carries a recognized image extension.

    The comparison is case-insensitive and looks only at the suffix, so
    ``photo.JPG`` and ``scan.tiff`` both qualify.
    """
    return Path(file_path).suffix.lower() in {
        ".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".rgb",
    }
20
+
21
+
22
def get_image_file_list(img_file):
    """Collect image paths from a single file or a directory (non-recursive).

    Raises a generic ``Exception`` when *img_file* is None, does not exist,
    or yields no image files — mirroring the caller's expectation of a
    single catch-all failure mode.
    """
    if img_file is None or not os.path.exists(img_file):
        raise Exception("not found any img file in {}".format(img_file))

    found = []
    if os.path.isfile(img_file) and is_image_file(img_file):
        found.append(img_file)
    elif os.path.isdir(img_file):
        for entry in os.listdir(img_file):
            candidate = os.path.join(img_file, entry)
            if is_image_file(candidate):
                found.append(candidate)

    if not found:
        raise Exception("not found any img file in {}".format(img_file))
    return found
38
+
39
+
40
def check_and_read_gif(img_path):
    """Read the first frame of a GIF as an RGB ndarray.

    Args:
        img_path: path to the image file.

    Returns:
        (frame, True) when *img_path* is a readable GIF — *frame* is an RGB
        ndarray of the first frame; (None, False) when the path is not a GIF
        or the file cannot be decoded.
    """
    # Case-insensitive check; the old ``basename(p)[-3:] in ["gif", "GIF"]``
    # slice missed mixed-case names like ".Gif".
    if not str(img_path).lower().endswith("gif"):
        return None, False

    gif = cv2.VideoCapture(img_path)
    try:
        ret, frame = gif.read()
    finally:
        gif.release()  # release the decoder handle (was leaked before)

    if not ret:
        print("Cannot read {}. This gif image maybe corrupted.".format(img_path))
        return None, False
    if len(frame.shape) == 2 or frame.shape[-1] == 1:
        frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
    return frame[:, :, ::-1], True  # BGR -> RGB
53
+
54
+
55
def _load_font(font_path, font_size):
    """Best-effort TrueType font load; fall back to PIL's bitmap default."""
    try:
        return ImageFont.truetype(font_path, font_size, encoding="utf-8")
    except Exception:  # narrowed from a bare ``except:`` so Ctrl-C still works
        return ImageFont.load_default()


def draw_ocr_box_txt(
    image, boxes, txts, scores=None, drop_score=0.5, font_path="./doc/simfang.ttf"
):
    """Draw OCR results as a two-panel image.

    Left panel: the input image with each detected box filled in a random
    (seeded, hence reproducible) color at 50% opacity.  Right panel: the
    recognized text drawn near the box's position — vertically, one
    character per line, when the box is clearly taller than wide.

    Args:
        image: PIL.Image to annotate.
        boxes: quadrilaterals, each a sequence of 4 (x, y) corner points.
        txts: recognized string per box.
        scores: optional confidence per box; entries below *drop_score* are skipped.
        drop_score: confidence threshold (default 0.5).
        font_path: TrueType font file used for the text panel.

    Returns:
        np.ndarray of shape (H, 2*W, 3): both panels side by side.
    """
    h, w = image.height, image.width
    img_left = image.copy()
    img_right = Image.new("RGB", (w, h), (255, 255, 255))

    import random
    random.seed(0)  # deterministic colors across runs

    draw_left = ImageDraw.Draw(img_left)
    draw_right = ImageDraw.Draw(img_right)

    for idx, (box, txt) in enumerate(zip(boxes, txts)):
        if scores is not None and scores[idx] < drop_score:
            continue

        color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
        draw_left.polygon(box, fill=color)
        draw_right.polygon(
            [
                box[0][0], box[0][1],
                box[1][0], box[1][1],
                box[2][0], box[2][1],
                box[3][0], box[3][1],
            ],
            outline=color,
        )

        # Box extents from the corner points: height along the left edge,
        # width along the top edge.
        box_height = math.sqrt((box[0][0] - box[3][0]) ** 2 + (box[0][1] - box[3][1]) ** 2)
        box_width = math.sqrt((box[0][0] - box[1][0]) ** 2 + (box[0][1] - box[1][1]) ** 2)

        if box_height > 2 * box_width:
            # Tall, narrow box: render vertically, one character at a time.
            font_size = max(int(box_width * 0.9), 10)
            font = _load_font(font_path, font_size)
            cur_y = box[0][1]
            for c in txt:
                try:
                    bbox = font.getbbox(c)
                    char_size = (bbox[2] - bbox[0], bbox[3] - bbox[1])
                except Exception:  # older PIL font objects may lack getbbox
                    char_size = (font_size, font_size)
                draw_right.text((box[0][0] + 3, cur_y), c, fill=(0, 0, 0), font=font)
                cur_y += char_size[1]
        else:
            font_size = max(int(box_height * 0.8), 10)
            font = _load_font(font_path, font_size)
            draw_right.text([box[0][0], box[0][1]], txt, fill=(0, 0, 0), font=font)

    img_left = Image.blend(image, img_left, 0.5)
    img_show = Image.new("RGB", (w * 2, h), (255, 255, 255))
    img_show.paste(img_left, (0, 0, w, h))
    img_show.paste(img_right, (w, 0, w * 2, h))

    return np.array(img_show)
121
+
122
+
123
def load_model():
    """Build the OCR pipeline.

    Returns:
        (config, ocr_system): the default Config and a TextSystem built from it.
    """
    cfg = Config()
    return cfg, TextSystem(cfg)
128
+
129
+
130
def process_folder(config, ocr_system):
    """Run OCR over every image in ``config.image_dir`` and persist results.

    For each image two artifacts are written under ``<base_dir>/output``:
    ``inference_txt/<name>.txt`` with one recognized line per row (no scores)
    and ``inference_results/<name>.jpg`` with the side-by-side visualization.

    Args:
        config: object exposing ``image_dir``, ``base_dir``, ``vis_font_path``
            and ``drop_score``.
        ocr_system: callable taking a BGR ndarray and returning
            ``(boxes, recs)`` where recs is a sequence of (text, score) pairs.
    """
    img_paths = get_image_file_list(config.image_dir)
    if not img_paths:
        print("[ERR] No images found in", config.image_dir)
        return

    out_root = Path(config.base_dir) / "output"
    txt_dir = out_root / "inference_txt"
    vis_dir = out_root / "inference_results"
    txt_dir.mkdir(parents=True, exist_ok=True)
    vis_dir.mkdir(parents=True, exist_ok=True)

    font = config.vis_font_path

    # Cumulative inference time in seconds (I/O excluded).
    total = 0.0
    for idx, p in enumerate(img_paths, 1):
        # GIFs need a dedicated reader; everything else goes through imread.
        img, is_gif = check_and_read_gif(p)
        if not is_gif:
            img = cv2.imread(p)
        if img is None:
            print(f"[WARN] skip {p}")
            continue

        t0 = time.time()
        boxes, recs = ocr_system(img)
        dt = time.time() - t0
        total += dt

        name = Path(p).stem

        with open(txt_dir / f"{name}.txt", "w", encoding="utf-8") as f:
            f.writelines(f"{txt}\n" for txt, sc in recs)  # DO NOT write confidence score in txt file

        # Visualization expects a PIL RGB image; OpenCV loads BGR.
        vis = draw_ocr_box_txt(
            Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)),
            boxes,
            [t for t, _ in recs],
            [s for _, s in recs],
            drop_score=config.drop_score,
            font_path=font,
        )
        # draw_ocr_box_txt returns RGB; flip channels back to BGR for imwrite.
        cv2.imwrite(str(vis_dir / f"{name}.jpg"), vis[:, :, ::-1])

        print(f"[{idx}/{len(img_paths)}] {Path(p).name} boxes={len(boxes)} time={dt:.3f}s")

    print(f"\nDone {len(img_paths)} images in {total:.2f}s (avg {total/len(img_paths):.3f}s)")
177
+
178
+
179
def main():
    """Entry point: load the OCR system and process the configured folder."""
    print("📥 Loading OCR model...")

    config, ocr_system = load_model()

    print("✅ OCR model loaded successfully!")
    print(f"📂 Processing images from: {config.image_dir}")
    print("=" * 50)

    process_folder(config, ocr_system)
192
+
193
+
194
# Run the demo only when executed as a script, not on import.
if __name__ == "__main__":
    main()
@@ -0,0 +1,151 @@
1
+ # Copyright © Nexa AI
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import os
18
+ import json
19
+ import time
20
+ import cv2
21
+ import numpy as np
22
+ from pathlib import Path
23
+ from typing import Any, List, Optional, Sequence, Tuple, Union
24
+ from PIL import Image
25
+ from dataclasses import dataclass
26
+
27
+ # Import necessary modules
28
+ import mlx.core as mx
29
+
30
+ # Import from ml.py for API alignment
31
+ from ml import (
32
+ CVModel as BaseCVModel,
33
+ CVModelConfig,
34
+ CVResults,
35
+ CVResult,
36
+ CVCapabilities,
37
+ Path as PathType,
38
+ )
39
+
40
+ # Import the model implementation
41
+ from .modeling.pp_ocr_v4 import Config, TextSystem
42
+
43
@dataclass
class CVConfig:
    """Configuration for CV processing.

    The hand-written ``__init__`` that previously shadowed the dataclass's
    generated one duplicated it exactly (same parameters, order, defaults);
    it has been removed so each field and its default is declared once.
    """
    # Number of images per inference batch.
    batch_size: int = 1
    # Confidence threshold below which OCR results are dropped.
    drop_score: float = 0.5
    # Optional TrueType font used for visualization output.
    font_path: Optional[str] = None
59
+
60
+
61
class CVModel(BaseCVModel):
    """
    CV Model interface for MLX OCR models.
    API aligned with ml.py CVModel abstract base class.
    """

    def __init__(
        self,
        config: CVModelConfig,
        device: Optional[str] = None,
    ) -> None:
        """Build the underlying PP-OCR pipeline from *config*."""
        super().__init__(config, device)
        # TODO: this hack is to support local model path
        # hack only support pp_ocr_v4: the model cache dir is recovered from
        # the recognizer weight path's parent directory.
        rec_path = str(config.rec_model_path) if config.rec_model_path else None
        cache_dir = os.path.dirname(rec_path) if rec_path else None
        ocr_cfg = Config(cache_dir)
        ocr_cfg.device = self.device
        self.ocr_system = TextSystem(ocr_cfg)

    def destroy(self) -> None:
        """Release the OCR pipeline and configuration references."""
        self.ocr_system = None
        self.config = None

    def close(self) -> None:
        """Alias for :meth:`destroy`."""
        self.destroy()

    def infer(self, input_image_path: str, clear_cache: bool = True) -> CVResults:
        """Run OCR on the image at *input_image_path*.

        Raises RuntimeError when the model was destroyed, ValueError when the
        image cannot be loaded. Optionally clears the MLX cache afterwards.
        """
        if self.ocr_system is None:
            raise RuntimeError("Model not loaded. Call load_model() first.")

        img = self._load_image(input_image_path)
        if img is None:
            raise ValueError(f"Failed to load image: {input_image_path}")

        boxes, recs = self.ocr_system(img)

        if clear_cache:
            mx.clear_cache()

        # Note: OCR doesn't use bounding boxes in the same way as detection
        # models, so only text and confidence are surfaced per region.
        results = [
            CVResult(text=text, confidence=score)
            for _box, (text, score) in zip(boxes, recs)
        ]
        return CVResults(results=results, result_count=len(results))

    def _load_image(self, image_path: Union[str, PathType]) -> Optional[np.ndarray]:
        """Load an image as an ndarray; first GIF frame for .gif, else imread."""
        try:
            if str(image_path).lower().endswith('.gif'):
                cap = cv2.VideoCapture(str(image_path))
                ok, frame = cap.read()
                if not ok:
                    return None
                if len(frame.shape) == 2 or frame.shape[-1] == 1:
                    frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
                return frame[:, :, ::-1]  # BGR to RGB
            # cv2.imread returns None on failure, matching our contract.
            return cv2.imread(str(image_path))
        except Exception as e:
            print(f"Error loading image {image_path}: {e}")
            return None
143
+
144
+
145
+
146
def create_cv_model(
    config: CVModelConfig,
    device: Optional[str] = None,
) -> CVModel:
    """Factory helper building a :class:`CVModel` from *config* and *device*."""
    return CVModel(config=config, device=device)
@@ -0,0 +1,81 @@
1
+ # Copyright © Nexa AI
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+
17
+ from .interface import create_cv_model, CVModelConfig
18
+
19
+
20
def test_cv_model(model_path, test_image_path):
    """Smoke-test the OCR CVModel: load it, then run inference on images.

    When *test_image_path* is None a bundled set of sample images is used;
    missing files are reported and skipped rather than failing the run.
    """

    # Create CVModelConfig (capability 0 selects OCR).
    config = CVModelConfig(
        capabilities=0,  # ML_CV_OCR
        model_path=model_path,
        system_library_path=None,
        backend_library_path=None,
        extension_library_path=None,
        config_file_path=None,
        char_dict_path=None
    )

    model = create_cv_model(config)
    print("✅ Model loaded successfully!")

    default_images = [
        "cv/modeling/input/20250406-170821.jpeg",
        "cv/modeling/input/20250406-170838.jpeg",
        "cv/modeling/input/20250406-170906.jpeg",
        "cv/modeling/input/20250407-154044.jpeg",
        "cv/modeling/input/20250407-154059.jpeg",
    ]
    test_images = default_images if test_image_path is None else [test_image_path]

    for img_path in test_images:
        if not os.path.exists(img_path):
            print(f"❌ Image file not found: {img_path}")
            continue

        results = model.infer(img_path)
        print(f"✅ OCR Results for {img_path}:")
        print("=" * 50)

        if results.result_count == 0:
            print("No text detected in the image.")
        else:
            print(f"Found {results.result_count} text regions:")

            for i, result in enumerate(results.results):
                print(f"\nRegion {i+1}:")
                print(f"  Text: '{result.text}'")
                print(f"  Confidence: {result.confidence:.3f}")

    print("\n✅ CV model test completed!")
66
+
67
+
68
# CLI entry point for manual smoke-testing of the OCR model.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Test CV processor functionality")
    parser.add_argument("--model_path", type=str, default="nexaml/paddle-ocr-mlx",
                        help="Path to the CV model")
    parser.add_argument("--image_path", type=str, default=None,
                        help="Path to a specific image to process")
    # NOTE(review): --test_mode is accepted but never read below — confirm
    # whether it should gate the sample-image fallback in test_cv_model.
    parser.add_argument("--test_mode", action="store_true",
                        help="Run in test mode with sample images")

    args = parser.parse_args()

    test_cv_model(args.model_path, args.image_path)
81
+