pydatamax 0.1.16.post1__tar.gz → 0.1.16.post2__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to the public registry. It is provided for informational purposes only.
Files changed (63)
  1. {pydatamax-0.1.16.post1/pydatamax.egg-info → pydatamax-0.1.16.post2}/PKG-INFO +54 -2
  2. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/README.md +53 -1
  3. pydatamax-0.1.16.post2/datamax/loader/core.py +144 -0
  4. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/loader/minio_handler.py +38 -19
  5. pydatamax-0.1.16.post2/datamax/parser/__init__.py +3 -0
  6. pydatamax-0.1.16.post2/datamax/parser/base.py +101 -0
  7. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/parser/core.py +215 -126
  8. pydatamax-0.1.16.post2/datamax/parser/csv_parser.py +51 -0
  9. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/parser/doc_parser.py +230 -141
  10. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/parser/docx_parser.py +275 -186
  11. pydatamax-0.1.16.post2/datamax/parser/epub_parser.py +77 -0
  12. pydatamax-0.1.16.post2/datamax/parser/html_parser.py +58 -0
  13. pydatamax-0.1.16.post2/datamax/parser/image_parser.py +72 -0
  14. pydatamax-0.1.16.post2/datamax/parser/json_parser.py +53 -0
  15. pydatamax-0.1.16.post2/datamax/parser/md_parser.py +92 -0
  16. pydatamax-0.1.16.post2/datamax/parser/pdf_parser.py +141 -0
  17. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/parser/ppt_parser.py +41 -9
  18. pydatamax-0.1.16.post2/datamax/parser/pptx_parser.py +73 -0
  19. pydatamax-0.1.16.post2/datamax/parser/txt_parser.py +77 -0
  20. pydatamax-0.1.16.post2/datamax/parser/xls_parser.py +54 -0
  21. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/parser/xlsx_parser.py +58 -51
  22. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/__init__.py +2 -1
  23. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/data_cleaner.py +36 -22
  24. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/env_setup.py +25 -18
  25. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/gotocr_pdf.py +13 -13
  26. pydatamax-0.1.16.post2/datamax/utils/lifecycle_types.py +18 -0
  27. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/mineru_operator.py +17 -15
  28. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/paddleocr_pdf_operator.py +34 -19
  29. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/ppt_extract.py +34 -11
  30. pydatamax-0.1.16.post2/datamax/utils/qa_generator.py +657 -0
  31. pydatamax-0.1.16.post2/datamax/utils/tokenizer.py +23 -0
  32. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/uno_handler.py +84 -72
  33. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2/pydatamax.egg-info}/PKG-INFO +54 -2
  34. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/pydatamax.egg-info/SOURCES.txt +1 -0
  35. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/setup.py +1 -1
  36. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/tests/test_doc_parser.py +3 -3
  37. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/tests/test_docx_parser.py +5 -5
  38. pydatamax-0.1.16.post1/datamax/loader/core.py +0 -119
  39. pydatamax-0.1.16.post1/datamax/parser/__init__.py +0 -2
  40. pydatamax-0.1.16.post1/datamax/parser/base.py +0 -77
  41. pydatamax-0.1.16.post1/datamax/parser/csv_parser.py +0 -31
  42. pydatamax-0.1.16.post1/datamax/parser/epub_parser.py +0 -41
  43. pydatamax-0.1.16.post1/datamax/parser/html_parser.py +0 -38
  44. pydatamax-0.1.16.post1/datamax/parser/image_parser.py +0 -34
  45. pydatamax-0.1.16.post1/datamax/parser/json_parser.py +0 -32
  46. pydatamax-0.1.16.post1/datamax/parser/md_parser.py +0 -73
  47. pydatamax-0.1.16.post1/datamax/parser/pdf_parser.py +0 -101
  48. pydatamax-0.1.16.post1/datamax/parser/pptx_parser.py +0 -45
  49. pydatamax-0.1.16.post1/datamax/parser/txt_parser.py +0 -46
  50. pydatamax-0.1.16.post1/datamax/parser/xls_parser.py +0 -26
  51. pydatamax-0.1.16.post1/datamax/utils/qa_generator.py +0 -369
  52. pydatamax-0.1.16.post1/datamax/utils/tokenizer.py +0 -22
  53. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/LICENSE +0 -0
  54. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/__init__.py +0 -0
  55. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/loader/__init__.py +0 -0
  56. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/loader/oss_handler.py +0 -0
  57. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/utils/constants.py +0 -0
  58. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/pydatamax.egg-info/dependency_links.txt +0 -0
  59. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/pydatamax.egg-info/requires.txt +0 -0
  60. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/pydatamax.egg-info/top_level.txt +0 -0
  61. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/setup.cfg +0 -0
  62. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/tests/test_docx_format_analysis.py +0 -0
  63. {pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/tests/test_wps_doc.py +0 -0
{pydatamax-0.1.16.post1/pydatamax.egg-info → pydatamax-0.1.16.post2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydatamax
-Version: 0.1.16.post1
+Version: 0.1.16.post2
 Summary: A library for parsing and converting various file formats.
 Home-page: https://github.com/Hi-Dolphin/datamax
 Author: ccy
@@ -113,7 +113,7 @@ qa_data = dm.get_pre_label(
     question_number=5,  # questions generated per chunk
     max_workers=5  # number of concurrent workers
 )
-dm.save_label_data(res)
+dm.save_label_data(qa_data)
 ```
 
 ## 📖 Detailed Documentation
@@ -316,6 +316,58 @@ pip install -r requirements.txt
 python setup.py install
 ```
 
+### Developer Mode
+
+For developers who want to contribute to the project or make modifications, we recommend using developer mode for a better development experience.
+
+#### Setup Developer Mode
+
+```bash
+# Clone the repository
+git clone https://github.com/Hi-Dolphin/datamax.git
+cd datamax
+
+# Create virtual environment (recommended)
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+
+# Install in developer mode
+pip install -e .
+```
+
+#### Benefits of Developer Mode
+
+- **Live Updates**: Changes to source code are immediately reflected without reinstallation
+- **Easy Testing**: Test your modifications instantly
+- **Debugging**: Better debugging experience with direct access to source code
+- **Development Workflow**: Seamless integration with your development environment
+
+#### Development Commands
+
+```bash
+# Run tests
+pytest
+
+# Install development dependencies
+pip install -r requirements-dev.txt  # if available
+
+# Check code style
+flake8 datamax/
+black datamax/
+
+# Build package
+python setup.py sdist bdist_wheel
+```
+
+#### Making Changes
+
+After installing in developer mode, you can:
+
+1. Edit source code in the `datamax/` directory
+2. Changes are automatically available when you import the module
+3. Test your changes immediately without reinstalling
+4. Submit pull requests with your improvements
+
 ## 📋 System Requirements
 
 - Python >= 3.10
{pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/README.md
@@ -52,7 +52,7 @@ qa_data = dm.get_pre_label(
     question_number=5,  # questions generated per chunk
     max_workers=5  # number of concurrent workers
 )
-dm.save_label_data(res)
+dm.save_label_data(qa_data)
 ```
 
 ## 📖 Detailed Documentation
@@ -255,6 +255,58 @@ pip install -r requirements.txt
 python setup.py install
 ```
 
+### Developer Mode
+
+For developers who want to contribute to the project or make modifications, we recommend using developer mode for a better development experience.
+
+#### Setup Developer Mode
+
+```bash
+# Clone the repository
+git clone https://github.com/Hi-Dolphin/datamax.git
+cd datamax
+
+# Create virtual environment (recommended)
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+
+# Install in developer mode
+pip install -e .
+```
+
+#### Benefits of Developer Mode
+
+- **Live Updates**: Changes to source code are immediately reflected without reinstallation
+- **Easy Testing**: Test your modifications instantly
+- **Debugging**: Better debugging experience with direct access to source code
+- **Development Workflow**: Seamless integration with your development environment
+
+#### Development Commands
+
+```bash
+# Run tests
+pytest
+
+# Install development dependencies
+pip install -r requirements-dev.txt  # if available
+
+# Check code style
+flake8 datamax/
+black datamax/
+
+# Build package
+python setup.py sdist bdist_wheel
+```
+
+#### Making Changes
+
+After installing in developer mode, you can:
+
+1. Edit source code in the `datamax/` directory
+2. Changes are automatically available when you import the module
+3. Test your changes immediately without reinstalling
+4. Submit pull requests with your improvements
+
 ## 📋 System Requirements
 
 - Python >= 3.10
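
The `save_label_data` hunks above (the same fix appears in PKG-INFO and README.md, since the package's long description embeds the README) correct a bug in the quick-start example: it previously saved an undefined variable `res` instead of the `qa_data` returned by `get_pre_label`. A condensed view of the corrected example; the `get_pre_label` arguments outside the diff context are elided here as well:

```python
qa_data = dm.get_pre_label(
    # ... arguments not visible in this diff context ...
    question_number=5,  # questions generated per chunk
    max_workers=5,      # number of concurrent workers
)
dm.save_label_data(qa_data)  # previously: dm.save_label_data(res)
```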
pydatamax-0.1.16.post2/datamax/loader/core.py
@@ -0,0 +1,144 @@
+import os
+from typing import List
+
+from datamax.loader.minio_handler import MinIOClient
+from datamax.loader.oss_handler import OssClient
+
+
+class DataLoader:
+    def __init__(
+        self,
+        endpoint: str = None,
+        secret_key: str = None,
+        access_key: str = None,
+        bucket_name: str = None,
+        source: str = None,
+    ):
+        if source and source == "Oss":
+            self.oss = OssClient(
+                oss_endpoint=endpoint,
+                oss_access_key_secret=secret_key,
+                oss_access_key_id=access_key,
+                oss_bucket_name=bucket_name,
+            )
+        elif source and source == "MinIO":
+            self.mi = MinIOClient(
+                endpoint=endpoint,
+                secret_key=secret_key,
+                access_key=access_key,
+                bucket_name=bucket_name,
+            )
+        self.download_path = str("./download_file")
+        self.source = source
+        self.bucket_name = bucket_name
+
+    @staticmethod
+    def load_from_file(local_file_path) -> List[str]:
+        if os.path.isfile(local_file_path):
+            if os.path.exists(local_file_path):
+                if os.access(local_file_path, os.R_OK):
+                    return [local_file_path]
+                else:
+                    return []
+            else:
+                return []
+        elif os.path.isdir(local_file_path):
+            access_path = []
+            # Recursively process all files and subdirectories under the current directory.
+            for item in os.listdir(local_file_path):
+                item_path = os.path.join(local_file_path, item)
+                item_results = DataLoader.load_from_file(item_path)
+                access_path.extend(item_results)
+            return access_path
+        else:
+            return []
+
+    def load_from_oss_source(self, oss_path: str) -> List[str]:
+        if not os.path.exists(self.download_path):
+            os.makedirs(self.download_path)
+
+        self.download(oss_path=oss_path)
+
+        file_list = []
+        for root, dirs, files in os.walk(self.download_path):
+            for file in files:
+                file_path = os.path.join(self.download_path, file)
+                file_list.append(file_path)
+
+        success_file_list = []
+        for file_path in file_list:
+            if self.load_from_file(file_path):
+                success_file_list.append(file_path)
+
+        return success_file_list
+
+    def download(self, oss_path: str):
+        if self.source == "MinIO":
+            file_list = self.mi.list_objects(
+                bucket_name=self.bucket_name, prefix=oss_path
+            )
+            for path in file_list:
+                self.mi.download_file(
+                    bucket_name=self.bucket_name,
+                    object_name=path,
+                    file_path=f'{self.download_path}/{path.split("/")[-1]}',
+                )
+        elif self.source == "Oss":
+            keys = self.oss.get_objects_in_folders(prefix=oss_path)
+            for path in keys:
+                self.oss.get_object_to_file(
+                    object_name=path,
+                    file_path=f'{self.download_path}/{path.split("/")[-1]}',
+                )
+
+    def upload(self, local_file_path: str, save_prefix: str):
+        if self.source == "MinIO":
+            if os.path.isdir(local_file_path):
+                for root, dirs, files in os.walk(local_file_path):
+                    for file in files:
+                        file_path = os.path.join(root, file)
+                        self.mi.upload_file(
+                            bucket_name=self.bucket_name,
+                            object_name=save_prefix + f"{file}",
+                            file_path=file_path,
+                        )
+            elif os.path.isfile(local_file_path):
+                self.mi.upload_file(
+                    bucket_name=self.bucket_name,
+                    object_name=save_prefix + os.path.basename(local_file_path),
+                    file_path=local_file_path,
+                )
+            else:
+                pass
+
+        elif self.source == "Oss":
+            if os.path.isdir(local_file_path):
+                self.oss.put_object_from_folder(
+                    object_folder_name=save_prefix, local_folder_path=local_file_path
+                )
+            elif os.path.isfile(local_file_path):
+                self.oss.put_object_from_file(
+                    object_name=save_prefix + os.path.basename(local_file_path),
+                    file_path=local_file_path,
+                )
+            else:
+                pass
+
+    def share(
+        self,
+        oss_path: str,
+        expires: int = None,
+        aliyun_oss_url_prefix: str = None,
+        csnt_url_prefix: str = None,
+    ):
+        if self.source == "MinIO":
+            return self.mi.get_object_tmp_link(
+                bucket_name=self.bucket_name, object_name=oss_path, expires=expires
+            )
+        elif self.source == "Oss":
+            return self.oss.get_oss_url(
+                object_name=oss_path,
+                url_expires_time=expires,
+                aliyun_oss_url_prefix=aliyun_oss_url_prefix,
+                csnt_url_prefix=csnt_url_prefix,
+            )
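
For orientation, here is a minimal usage sketch of the new `DataLoader`. It relies only on the constructor and methods visible in the hunk above; the endpoint, credentials, bucket name, and paths are placeholders, not values from the package.

```python
from datamax.loader.core import DataLoader

# Wire up a MinIO-backed loader (source may be "MinIO" or "Oss").
loader = DataLoader(
    endpoint="minio.example.com:9000",  # placeholder endpoint
    access_key="YOUR_ACCESS_KEY",       # placeholder credential
    secret_key="YOUR_SECRET_KEY",       # placeholder credential
    bucket_name="datasets",             # placeholder bucket
    source="MinIO",
)

# Local paths: returns the readable files, recursing into directories.
readable = DataLoader.load_from_file("./docs")

# Remote prefix: downloads objects into ./download_file and returns
# the local paths that turned out to be readable.
downloaded = loader.load_from_oss_source(oss_path="raw/2024/")
```

Note that `download` flattens object keys to their basenames (`path.split("/")[-1]`), so distinct objects sharing a filename will overwrite each other in `./download_file`.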
{pydatamax-0.1.16.post1 → pydatamax-0.1.16.post2}/datamax/loader/minio_handler.py
@@ -1,11 +1,12 @@
 import os
-from dotenv import load_dotenv
+import re
 from datetime import timedelta
+
+from dotenv import load_dotenv
+from loguru import logger
 from minio import Minio
 from minio.commonconfig import Tags
 from minio.error import S3Error
-from loguru import logger
-import re
 
 load_dotenv()
 
@@ -25,7 +26,7 @@ class MinIOClient:
                 self.endpoint,
                 access_key=self.access_key,
                 secret_key=self.secret_key,
-                secure=self.secure
+                secure=self.secure,
             )
             return client
         except S3Error as e:
@@ -55,7 +56,9 @@ class MinIOClient:
         if self.client:
             try:
                 self.client.fput_object(bucket_name, object_name, file_path)
-                logger.info(f"File '{file_path}' uploaded to bucket '{bucket_name}' as '{object_name}'.")
+                logger.info(
+                    f"File '{file_path}' uploaded to bucket '{bucket_name}' as '{object_name}'."
+                )
             except S3Error as e:
                 raise
 
@@ -63,15 +66,18 @@
         if self.client:
             try:
                 self.client.fget_object(bucket_name, object_name, file_path)
-                logger.info(f"Object '{object_name}' from bucket '{bucket_name}' downloaded to '{file_path}'.")
+                logger.info(
+                    f"Object '{object_name}' from bucket '{bucket_name}' downloaded to '{file_path}'."
+                )
                 return file_path
             except Exception as e:
                 try:
                     illegal_chars = r'[\/:*?"<>|]'
-                    file_path = re.sub(illegal_chars, '_', file_path)
+                    file_path = re.sub(illegal_chars, "_", file_path)
                     self.client.fget_object(bucket_name, object_name, file_path)
                     logger.info(
-                        f"Object {object_name} from bucket {bucket_name} downloaded to {file_path}'.")
+                        f"Object {object_name} from bucket {bucket_name} downloaded to {file_path}'."
+                    )
                     return file_path
                 except Exception as e:
                     raise
@@ -81,7 +87,9 @@
         try:
             result_list = []
             if prefix:
-                objects = self.client.list_objects(bucket_name, recursive=True, prefix=prefix)
+                objects = self.client.list_objects(
+                    bucket_name, recursive=True, prefix=prefix
+                )
             else:
                 objects = self.client.list_objects(bucket_name, recursive=True)
             logger.info(f"Objects in bucket '{bucket_name}':")
@@ -99,8 +107,7 @@
             raise
 
     def calculate_bucket_stats(self, bucket_name, prefix):
-        objects = self.client.list_objects(bucket_name,
-                                           prefix=prefix, recursive=True)
+        objects = self.client.list_objects(bucket_name, prefix=prefix, recursive=True)
         total_size = 0
         object_count = 0
 
@@ -115,14 +122,16 @@
     def get_objects(self, bucket_name, object_name):
        try:
            response = self.client.get_object(bucket_name, object_name)
-           content = response.read().decode('utf-8')
+           content = response.read().decode("utf-8")
            return content
        except Exception as e:
            raise
 
    def get_object_tag(self, bucket_name, object_name):
        try:
-           tags = self.client.get_object_tags(bucket_name=bucket_name, object_name=object_name)
+           tags = self.client.get_object_tags(
+               bucket_name=bucket_name, object_name=object_name
+           )
            return tags
        except Exception as e:
            raise
@@ -130,7 +139,9 @@
    def update_object_tag(self, bucket_name, object_name, tags):
        try:
            tags_obj = Tags.new_object_tags()
-           tag_info = self.get_object_tag(bucket_name=bucket_name, object_name=object_name)
+           tag_info = self.get_object_tag(
+               bucket_name=bucket_name, object_name=object_name
+           )
            if tag_info is None:
                tag_info = {}
                for tag_dict in tags:
@@ -142,7 +153,9 @@
 
                for k, v in tag_info.items():
                    tags_obj[k] = v
-               self.client.set_object_tags(bucket_name=bucket_name, object_name=object_name, tags=tags_obj)
+               self.client.set_object_tags(
+                   bucket_name=bucket_name, object_name=object_name, tags=tags_obj
+               )
            else:
                for tag_dict in tags:
                    for tag_key, tag_value in tag_dict.items():
@@ -153,20 +166,26 @@
 
                for k, v in tag_info.items():
                    tags_obj[k] = v
-               self.client.set_object_tags(bucket_name=bucket_name, object_name=object_name, tags=tags_obj)
+               self.client.set_object_tags(
+                   bucket_name=bucket_name, object_name=object_name, tags=tags_obj
+               )
            return tag_info
        except Exception as e:
            raise
 
    def reset_object_tag(self, bucket_name, object_name):
        try:
-           self.client.delete_object_tags(bucket_name=bucket_name, object_name=object_name)
+           self.client.delete_object_tags(
+               bucket_name=bucket_name, object_name=object_name
+           )
            return True
        except Exception as e:
            raise
 
    def get_object_tmp_link(self, bucket_name, object_name, expires):
        try:
-           return self.client.presigned_get_object(bucket_name, object_name, expires=timedelta(days=expires))
+           return self.client.presigned_get_object(
+               bucket_name, object_name, expires=timedelta(days=expires)
+           )
        except Exception as e:
-           raise
+           raise
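
The `minio_handler.py` hunks above are essentially formatter output (wrapped argument lists, trailing commas, double quotes) plus regrouped imports; behavior is unchanged. One detail worth calling out from the last hunk: `get_object_tmp_link` passes `expires` through `timedelta(days=expires)`, so the argument counts days, not seconds. A minimal sketch, assuming placeholder credentials; the constructor arguments mirror the `DataLoader` wiring shown earlier:

```python
from datamax.loader.minio_handler import MinIOClient

# All values here are placeholders, not real configuration.
client = MinIOClient(
    endpoint="minio.example.com:9000",
    access_key="YOUR_ACCESS_KEY",
    secret_key="YOUR_SECRET_KEY",
    bucket_name="datasets",
)

# `expires` is a number of days: presigned_get_object is called with
# timedelta(days=expires) internally.
url = client.get_object_tmp_link(
    bucket_name="datasets", object_name="raw/report.pdf", expires=7
)
```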
pydatamax-0.1.16.post2/datamax/parser/__init__.py
@@ -0,0 +1,3 @@
+from loguru import logger
+
+from .core import DataMax
pydatamax-0.1.16.post2/datamax/parser/base.py
@@ -0,0 +1,101 @@
+import os
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Union
+
+from datamax.utils.lifecycle_types import LifeType
+from datamax.utils.tokenizer import DashScopeClient
+
+
+class LifeCycle:
+    """
+    Life cycle class
+    """
+
+    def __init__(
+        self, update_time: str, life_type: list, life_metadata: Dict[str, str]
+    ):
+        self.update_time = update_time  # Update time
+        self.life_type = life_type  # Life cycle type
+        self.life_metadata = life_metadata  # Life cycle metadata
+
+    def update(self, update_time: str, life_type: list, life_metadata: Dict[str, str]):
+        self.update_time = update_time
+        self.life_type = life_type
+        self.life_metadata.update(life_metadata)
+
+    def __str__(self):
+        metadata_str = ", ".join(f"{k}: {v}" for k, v in self.life_metadata.items())
+        return f"update_time: {self.update_time}, life_type: {self.life_type}, life_metadata: {{{metadata_str}}}"
+
+    def to_dict(self):
+        return {
+            "update_time": self.update_time,
+            "life_type": self.life_type,
+            "life_metadata": self.life_metadata,
+        }
+
+
+class MarkdownOutputVo:
+    """
+    Markdown output conversion
+    """
+
+    def __init__(self, extension: str, content: str):
+        self.extension: str = extension  # File type
+        self.content: str = content  # Markdown content
+        self.lifecycle: List[LifeCycle] = []  # Life cycle data
+
+    def add_lifecycle(self, lifecycle: LifeCycle):
+        self.lifecycle.append(lifecycle)
+
+    def to_dict(self):
+        data_dict = {
+            "extension": self.extension,
+            "content": self.content,
+            "lifecycle": [lc.to_dict() for lc in self.lifecycle],
+        }
+        return data_dict
+
+
+class BaseLife:
+    tk_client = DashScopeClient()
+
+    @staticmethod
+    def generate_lifecycle(
+        source_file: str,
+        domain: str,
+        life_type: Union[LifeType, str, List[Union[LifeType, str]]],
+        usage_purpose: str,
+    ) -> LifeCycle:
+        """
+        Construct a LifeCycle record; accepts a single enum/string or a mixed list of both.
+        """
+        # 1) Normalize the input to a list
+        if isinstance(life_type, (list, tuple)):
+            raw = list(life_type)
+        else:
+            raw = [life_type]
+
+        # 2) For enum members, take their value
+        life_list: List[str] = [
+            lt.value if isinstance(lt, LifeType) else lt for lt in raw
+        ]
+
+        update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        try:
+            storage = os.path.getsize(source_file)
+        except Exception:
+            storage = 0
+        life_metadata = {
+            "storage_size": storage,
+            "source_file": source_file,
+            "domain": domain,
+            "usage_purpose": usage_purpose,
+        }
+        return LifeCycle(update_time, life_list, life_metadata)
+
+    @staticmethod
+    def get_file_extension(file_path):
+        file_path = Path(file_path)
+        return file_path.suffix[1:].lower()
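
Finally, a sketch of how the new lifecycle pieces in `parser/base.py` compose. `generate_lifecycle` accepts a `LifeType` enum member, a plain string, or a mixed list of both; a string is used below because the members of `LifeType` live in `datamax/utils/lifecycle_types.py`, whose body is not shown in this diff, so the label is hypothetical.

```python
from datamax.parser.base import BaseLife, MarkdownOutputVo

# Build a lifecycle record; storage_size falls back to 0 when the
# source file cannot be stat'ed.
lifecycle = BaseLife.generate_lifecycle(
    source_file="./docs/report.pdf",  # placeholder path
    domain="Technology",
    life_type="DATA_PROCESSING",      # hypothetical label; enum members not shown
    usage_purpose="Parsing",
)

# Attach it to a Markdown output value object and serialize.
vo = MarkdownOutputVo(extension="pdf", content="# parsed markdown ...")
vo.add_lifecycle(lifecycle)
print(vo.to_dict())  # {"extension": "pdf", "content": "...", "lifecycle": [{...}]}
```

Note that importing `BaseLife` instantiates the class-level `DashScopeClient` (`tk_client`), so the tokenizer dependency must be importable even for this metadata-only path.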