PyPI - orbitkit - Versions diffs - 0.8.47__tar.gz → 0.8.48__tar.gz - Mend

orbitkit 0.8.47tar.gz → 0.8.48tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (79) hide show

{orbitkit-0.8.47/orbitkit.egg-info → orbitkit-0.8.48}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: orbitkit
-Version: 0.8.47
+Version: 0.8.48
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao

orbitkit-0.8.48/orbitkit/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.8.48

orbitkit-0.8.48/orbitkit/orbit_type/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .orbit_type_simple import OrbitTypeMatcher, OrbitTypeMatcherAsync

{orbitkit-0.8.47 → orbitkit-0.8.48}/orbitkit/orbit_type/orbit_type_simple.py RENAMED Viewed

@@ -2,9 +2,10 @@ import abc
 import json
 import logging
 import re
-from typing import List, Literal
+from typing import List, Literal, Optional
 from botocore.exceptions import ClientError
 import boto3
+import aioboto3
 logger = logging.getLogger(__name__)
@@ -459,6 +460,281 @@ class OrbitTypeMatcher:
             }
+class OrbitTypeMatcherAsync:
+    """
+    OrbitTypeMatcher 的异步版本
+    使用 aioboto3 进行异步 S3 操作，提升性能
+    """
+    def __init__(
+        self,
+        aws_access_key_id: Optional[str] = None,
+        aws_secret_access_key: Optional[str] = None,
+        file_bucket: str = 'ot-cdn',
+        key_prefix: str = 'orbit-typing/typing-prod'
+    ):
+        """
+        初始化异步类型匹配器
+        :param aws_access_key_id: AWS access key ID（可选，不提供则使用默认凭证链）
+        :param aws_secret_access_key: AWS secret access key（可选）
+        :param file_bucket: S3 bucket 名称
+        :param key_prefix: S3 key 前缀
+        """
+        # 创建 aioboto3 session
+        if aws_access_key_id and aws_secret_access_key:
+            self.session = aioboto3.Session(
+                aws_access_key_id=aws_access_key_id,
+                aws_secret_access_key=aws_secret_access_key
+            )
+        else:
+            # 使用默认凭证链（环境变量、AWS CLI 配置、IAM 角色等）
+            self.session = aioboto3.Session()
+        self.file_bucket = file_bucket
+        self.key_prefix = key_prefix
+        self.matcher = None
+        self.json_file = None
+        self.version = None
+    @classmethod
+    async def create(
+        cls,
+        aws_access_key_id: Optional[str] = None,
+        aws_secret_access_key: Optional[str] = None,
+        file_bucket: str = 'ot-cdn',
+        key_prefix: str = 'orbit-typing/typing-prod',
+        version: Optional[str] = None
+    ):
+        """
+        异步工厂方法，用于创建已初始化的 OrbitTypeMatcherAsync 实例
+        :param aws_access_key_id: AWS access key ID（可选）
+        :param aws_secret_access_key: AWS secret access key（可选）
+        :param file_bucket: S3 bucket 名称
+        :param key_prefix: S3 key 前缀
+        :param version: 类型版本号（可选，不提供则使用最新版本）
+        :return: 已初始化的 OrbitTypeMatcherAsync 实例
+        示例：
+            matcher = await OrbitTypeMatcherAsync.create(version='0.3.4')
+            result = matcher.match_type(title='Press Release')
+        """
+        instance = cls(
+            aws_access_key_id=aws_access_key_id,
+            aws_secret_access_key=aws_secret_access_key,
+            file_bucket=file_bucket,
+            key_prefix=key_prefix
+        )
+        # 异步加载版本和配置文件
+        if not version:
+            version = await instance.get_newest_version()
+        source_key = f'{key_prefix}/{version}.json'
+        json_file = await instance.read_s3_file(
+            bucket_name=file_bucket,
+            file_name=source_key
+        )
+        if not json_file:
+            raise Exception(f'该 S3 文件不存在: {source_key}')
+        instance.json_file = json_file
+        instance.matcher = L3RuleListMatcher(json_file)
+        instance.version = version
+        return instance
+    async def read_s3_file(self, bucket_name: str, file_name: str):
+        """
+        异步读取 S3 中的 JSON 文件
+        :param bucket_name: S3 bucket 名称
+        :param file_name: 文件路径
+        :return: JSON 数据或 None
+        """
+        try:
+            async with self.session.client('s3') as s3_client:
+                file_obj = await s3_client.get_object(Bucket=bucket_name, Key=file_name)
+                content = await file_obj['Body'].read()
+                lines = content.decode('utf-8')
+                json_data = json.loads(lines)
+                return json_data
+        except ClientError as e:
+            logger.error(f"读取 S3 文件失败: {bucket_name}/{file_name}, 错误: {e}")
+            return None
+    async def get_newest_version(self):
+        """
+        异步获取最新的类型版本号
+        :return: 最新版本号字符串
+        """
+        async with self.session.client('s3') as s3_client:
+            response = await s3_client.list_objects_v2(
+                Bucket=self.file_bucket,
+                Prefix=self.key_prefix
+            )
+            # 遍历存储桶中的对象，获取文件名
+            file_names = []
+            for obj in response.get('Contents', []):
+                key = obj['Key']
+                if key.endswith('/'):  # 排除文件夹本身
+                    continue
+                file_names.append(key.split('/')[-1])  # 获取文件名部分
+            result = []
+            for key in file_names:
+                if 'tmp.json' in key:
+                    continue
+                if '.json' not in key:
+                    continue
+                result.append(key.replace('.json', ''))
+            def sort_vision_json_file(filename):
+                parts = filename.replace('.json', '').split('.')
+                return tuple(int(part) for part in parts)
+            sorted_versions = sorted(result, key=sort_vision_json_file, reverse=True)
+            return sorted_versions[0]
+    def get_full_type_list(self):
+        """
+        获取完整包含 1、2、3 级类型的列表
+        注意：此方法是同步的，因为它只进行内存操作
+        :return: 完整类型列表
+        """
+        lv1_list = self.json_file['lv1_list']
+        lv1_dict = {}
+        for row in lv1_list:
+            lv1_id = row['lv1_id']
+            name = row['lv1_name']
+            lv1_dict[lv1_id] = name
+        lv2_list = self.json_file['lv2_list']
+        lv2_dict = {}
+        for row in lv2_list:
+            lv1_id = row['lv1_id']
+            lv2_id = row['lv2_id']
+            name = row['lv2_name']
+            lv2_dict[lv2_id] = {
+                'lv2_id': lv2_id,
+                'lv2_name': name,
+                'lv1_id': lv1_id,
+                'lv1_name': lv1_dict[lv1_id]
+            }
+        lv3_list = self.json_file['lv3_list']
+        lv3_data = []
+        for row in lv3_list:
+            lv2_id = row['level_2_id']
+            this_dict = {
+                'lv3_id': row['lv3_id'],
+                'lv3_name': row['lv3_name']
+            }
+            this_dict.update(lv2_dict[lv2_id])
+            lv3_data.append(this_dict)
+        sorted_list = sorted(
+            lv3_data,
+            key=lambda x: (int(x['lv3_id']), int(x['lv2_id']), int(x['lv1_id']))
+        )
+        return sorted_list
+    def match_type(self, match_flag: Literal['in_order', 'match_all'] = 'in_order', **match_kwargs):
+        """
+        用于匹配的函数，可以传入 n 个匹配参数
+        注意：此方法是同步的，因为匹配操作只涉及内存操作，不需要 I/O
+        :param match_flag: 匹配模式
+            - 'in_order': 按顺序依次匹配，匹配到结果就停止
+            - 'match_all': 匹配全部的项，并将结果合并
+        :param match_kwargs: 匹配参数（如 title='...', url='...', description='...'）
+        :return: 匹配结果
+        示例：
+            # 按顺序匹配
+            key, result = matcher.match_type(
+                match_flag='in_order',
+                title='Press Release',
+                description='Company news'
+            )
+            # 匹配所有
+            result = matcher.match_type(
+                match_flag='match_all',
+                title='Press Release',
+                url='https://example.com/news/press-release'
+            )
+        """
+        if match_flag not in ['in_order', 'match_all']:
+            raise ValueError('match_flag 参数必须是 "in_order" 或 "match_all"')
+        if not match_kwargs:
+            raise ValueError('必须传入匹配关键词！')
+        default_result = [{'lv3_id': '19999', 'lv3_name': 'Miscellaneous'}]
+        def match_url(url: str):
+            split_url = url.replace('http://', '').replace('https://', '').split('/')
+            split_url = [x.strip().replace('%20', ' ') for x in split_url if len(x.strip()) > 4]
+            for part in reversed(split_url):
+                match_result = self.matcher.start_match_all(part)
+                if match_result:
+                    match_key = f'url#{part}'
+                    return match_key, match_result
+            return None, None
+        if match_flag == 'in_order':
+            for key, value in match_kwargs.items():
+                if key == 'url':
+                    url_key, match_result = match_url(value)
+                    if match_result:
+                        return url_key, match_result
+                else:
+                    match_result = self.matcher.start_match_all(value)
+                    if match_result:
+                        return key, match_result
+            return None, default_result
+        elif match_flag == 'match_all':
+            overall_results = []
+            match_detail = {}
+            for key, value in match_kwargs.items():
+                if key == 'url':
+                    url_key, match_result = match_url(value)
+                    if match_result:
+                        overall_results += match_result
+                        match_detail[url_key] = match_result
+                else:
+                    match_result = self.matcher.start_match_all(value)
+                    if match_result:
+                        overall_results += match_result
+                        match_detail[key] = match_result
+            if not overall_results:
+                overall_results = default_result
+            unique_dict = {d['lv3_id']: d for d in overall_results}
+            sorted_results = sorted(unique_dict.values(), key=lambda x: int(x['lv3_id']))
+            return {
+                'results': sorted_results,
+                'match_detail': match_detail
+            }
 if __name__ == '__main__':
+    # 同步版本示例
     matcher = OrbitTypeMatcher(version='0.2.1')
     matcher.match_type(title='asdf')
+    # 异步版本示例
+    # import asyncio
+    # async def test_async():
+    #     matcher = await OrbitTypeMatcherAsync.create(version='0.3.4')
+    #     result = matcher.match_type(match_flag='in_order', title='Press Release')
+    #     print(result)
+    # asyncio.run(test_async())

{orbitkit-0.8.47 → orbitkit-0.8.48/orbitkit.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: orbitkit
-Version: 0.8.47
+Version: 0.8.48
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao