orbitkit 0.8.46__tar.gz → 0.8.48__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. {orbitkit-0.8.46/orbitkit.egg-info → orbitkit-0.8.48}/PKG-INFO +28 -6
  2. orbitkit-0.8.48/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/audio_transcoder/netmind_extract_v1.py +5 -4
  4. orbitkit-0.8.48/orbitkit/orbit_type/__init__.py +1 -0
  5. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/orbit_type/orbit_type_simple.py +277 -1
  6. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/__init__.py +1 -0
  7. orbitkit-0.8.48/orbitkit/util/util_aws_s3_wrapper.py +369 -0
  8. {orbitkit-0.8.46 → orbitkit-0.8.48/orbitkit.egg-info}/PKG-INFO +28 -6
  9. orbitkit-0.8.48/orbitkit.egg-info/requires.txt +8 -0
  10. {orbitkit-0.8.46 → orbitkit-0.8.48}/setup.py +13 -4
  11. orbitkit-0.8.46/orbitkit/VERSION +0 -1
  12. orbitkit-0.8.46/orbitkit/orbit_type/__init__.py +0 -1
  13. orbitkit-0.8.46/orbitkit/util/util_aws_s3_wrapper.py +0 -154
  14. orbitkit-0.8.46/orbitkit.egg-info/requires.txt +0 -6
  15. {orbitkit-0.8.46 → orbitkit-0.8.48}/LICENSE +0 -0
  16. {orbitkit-0.8.46 → orbitkit-0.8.48}/MANIFEST.in +0 -0
  17. {orbitkit-0.8.46 → orbitkit-0.8.48}/README.md +0 -0
  18. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/__init__.py +0 -0
  19. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/airflow_handler/__init__.py +0 -0
  20. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
  21. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
  22. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  23. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/airflow_handler/file_handler.py +0 -0
  24. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
  25. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/audio_transcoder/__init__.py +0 -0
  26. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/constant/__init__.py +0 -0
  27. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/constant/report_schema.py +0 -0
  28. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/id_srv/__init__.py +0 -0
  29. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/id_srv/id_gen.py +0 -0
  30. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/id_srv/id_perm_like.py +0 -0
  31. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/lark_send/__init__.py +0 -0
  32. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/lark_send/lark.py +0 -0
  33. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/llm_tools/__init__.py +0 -0
  34. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  35. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  36. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/orbit_type/tools.py +0 -0
  37. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_embedding/__init__.py +0 -0
  38. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  39. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  40. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor/__init__.py +0 -0
  41. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  42. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor/exceptions.py +0 -0
  43. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  44. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  45. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  46. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  47. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
  48. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  49. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  50. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
  51. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  52. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  53. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor_simple/base.py +0 -0
  54. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  55. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor_simple/core.py +0 -0
  56. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  57. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  58. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  59. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_writer/__init__.py +0 -0
  60. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  61. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/cache_asset_downloader.py +0 -0
  62. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/common.py +0 -0
  63. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/customize_regix_manager.py +0 -0
  64. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/secret_manager.py +0 -0
  65. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/util_aliyun.py +0 -0
  66. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  67. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/util_aws.py +0 -0
  68. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/util_date.py +0 -0
  69. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/util_html.py +0 -0
  70. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/util_kafka.py +0 -0
  71. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/util_md5.py +0 -0
  72. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/util_selenium.py +0 -0
  73. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/util_simple_timer.py +0 -0
  74. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/util_str.py +0 -0
  75. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/util_type_mapping.py +0 -0
  76. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit/util/util_url.py +0 -0
  77. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit.egg-info/SOURCES.txt +0 -0
  78. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit.egg-info/dependency_links.txt +0 -0
  79. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit.egg-info/not-zip-safe +0 -0
  80. {orbitkit-0.8.46 → orbitkit-0.8.48}/orbitkit.egg-info/top_level.txt +0 -0
  81. {orbitkit-0.8.46 → orbitkit-0.8.48}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: orbitkit
3
- Version: 0.8.46
3
+ Version: 0.8.48
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -19,15 +19,37 @@ Classifier: Programming Language :: Python :: 3.4
19
19
  Classifier: Programming Language :: Python :: 3.5
20
20
  Classifier: Programming Language :: Python :: 3.6
21
21
  Classifier: Programming Language :: Python :: 3.7
22
+ Classifier: Programming Language :: Python :: 3.8
23
+ Classifier: Programming Language :: Python :: 3.9
24
+ Classifier: Programming Language :: Python :: 3.10
25
+ Classifier: Programming Language :: Python :: 3.11
26
+ Classifier: Programming Language :: Python :: 3.12
27
+ Classifier: Programming Language :: Python :: 3.13
28
+ Classifier: Programming Language :: Python :: 3.14
22
29
  Classifier: Topic :: Software Development :: Libraries
23
30
  Description-Content-Type: text/markdown
24
31
  License-File: LICENSE
25
- Requires-Dist: boto3>=1.16.0
26
- Requires-Dist: requests>=2.12.1
27
- Requires-Dist: prettytable>=3.0.0
28
- Requires-Dist: pytz>=2022.1
32
+ Requires-Dist: boto3>=1.40.46
33
+ Requires-Dist: aioboto3>=15.5.0
34
+ Requires-Dist: aiofiles>=25.1.0
35
+ Requires-Dist: requests>=2.32.5
36
+ Requires-Dist: prettytable>=3.17.0
37
+ Requires-Dist: pytz>=2025.2
29
38
  Requires-Dist: Deprecated
30
39
  Requires-Dist: func_timeout
40
+ Dynamic: author
41
+ Dynamic: author-email
42
+ Dynamic: classifier
43
+ Dynamic: description
44
+ Dynamic: description-content-type
45
+ Dynamic: home-page
46
+ Dynamic: license
47
+ Dynamic: license-file
48
+ Dynamic: maintainer
49
+ Dynamic: maintainer-email
50
+ Dynamic: platform
51
+ Dynamic: requires-dist
52
+ Dynamic: summary
31
53
 
32
54
  # orbitkit
33
55
 
@@ -0,0 +1 @@
1
+ 0.8.48
@@ -74,11 +74,11 @@ class Translate:
74
74
  # "start": 0.031, "end": 2.921},
75
75
  # {"speaker": "SPEAKER_01", "text": "We just wanted to take a minute to thank you.", "start": 3.507, "end": 4.962}],
76
76
  # "id": "0ca63ef01e224adca4865b3cec94c1a2", "model": "WhisperX"}
77
- def text_processing(netmind_data, lang):
77
+ def text_processing(netmind_data, lang, translate_model='gpt-4.1-mini'):
78
78
  import fasttext
79
79
  from urllib.request import urlretrieve
80
80
  model_path = "lid.176.bin"
81
- tran = Translate()
81
+ tran = Translate(model=translate_model)
82
82
  # 如果模型不存在,则下载
83
83
  if not os.path.exists(model_path):
84
84
  logger.info("Downloading fasttext language detection model...")
@@ -145,6 +145,7 @@ def send_request_to_stream(file_steam, **kwargs):
145
145
  def request_wav_from_netmind(s3_client, s3_path=None, file_steam=None, **kwargs):
146
146
  lang = kwargs.get('lang', 'en')
147
147
  folder = kwargs.get('folder', '')
148
+ translate_model = kwargs.get('translate_model', 'gpt-4.1-mini')
148
149
  if s3_path:
149
150
  s3_path_obj = s3_split_path(s3_path)
150
151
  # 开始尝试提取...
@@ -165,10 +166,10 @@ def request_wav_from_netmind(s3_client, s3_path=None, file_steam=None, **kwargs)
165
166
  with open(json_netmind_wav_path, 'w', encoding='utf-8') as json_file:
166
167
  json.dump(data, json_file, ensure_ascii=False, indent=4)
167
168
 
168
- net_process = text_processing(data, lang)
169
+ net_process = text_processing(data, lang, translate_model=translate_model)
169
170
 
170
171
  # 翻译接口处理
171
172
  json_netmind_lang_wav_path = os.path.join(folder, 'netmind_lang_wav.json')
172
173
  with open(json_netmind_lang_wav_path, 'w', encoding='utf-8') as json_file:
173
174
  json.dump(net_process, json_file, ensure_ascii=False, indent=4)
174
- return json_netmind_wav_path, json_netmind_lang_wav_path
175
+ return json_netmind_wav_path, json_netmind_lang_wav_path
@@ -0,0 +1 @@
1
+ from .orbit_type_simple import OrbitTypeMatcher, OrbitTypeMatcherAsync
@@ -2,9 +2,10 @@ import abc
2
2
  import json
3
3
  import logging
4
4
  import re
5
- from typing import List, Literal
5
+ from typing import List, Literal, Optional
6
6
  from botocore.exceptions import ClientError
7
7
  import boto3
8
+ import aioboto3
8
9
 
9
10
  logger = logging.getLogger(__name__)
10
11
 
@@ -459,6 +460,281 @@ class OrbitTypeMatcher:
459
460
  }
460
461
 
461
462
 
463
class OrbitTypeMatcherAsync:
    """Asynchronous counterpart of OrbitTypeMatcher.

    S3 access goes through an aioboto3 session; matching itself is delegated
    to an L3RuleListMatcher built from the downloaded rule JSON and involves
    no I/O. ``__init__`` performs no network calls and leaves ``matcher``,
    ``json_file`` and ``version`` unset, so use the ``create`` factory to get
    a ready-to-use instance.
    """

    def __init__(
        self,
        aws_access_key_id: Optional[str] = None,
        aws_secret_access_key: Optional[str] = None,
        file_bucket: str = 'ot-cdn',
        key_prefix: str = 'orbit-typing/typing-prod'
    ):
        """Store the S3 location and build the aioboto3 session.

        :param aws_access_key_id: AWS access key ID (optional).
        :param aws_secret_access_key: AWS secret access key (optional).
        :param file_bucket: Name of the S3 bucket holding the rule files.
        :param key_prefix: S3 key prefix under which the rule files live.
        """
        if aws_access_key_id and aws_secret_access_key:
            self.session = aioboto3.Session(
                aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key
            )
        else:
            # Explicit credentials are only used when BOTH are given;
            # otherwise a default session is created and credential
            # resolution is left to aioboto3.
            self.session = aioboto3.Session()

        self.file_bucket = file_bucket
        self.key_prefix = key_prefix
        # Populated by ``create``.
        self.matcher = None
        self.json_file = None
        self.version = None

    @classmethod
    async def create(
        cls,
        aws_access_key_id: Optional[str] = None,
        aws_secret_access_key: Optional[str] = None,
        file_bucket: str = 'ot-cdn',
        key_prefix: str = 'orbit-typing/typing-prod',
        version: Optional[str] = None
    ):
        """Asynchronously build a fully initialised matcher.

        :param aws_access_key_id: AWS access key ID (optional).
        :param aws_secret_access_key: AWS secret access key (optional).
        :param file_bucket: Name of the S3 bucket holding the rule files.
        :param key_prefix: S3 key prefix under which the rule files live.
        :param version: Rule version to load; when falsy, the newest version
            found under ``key_prefix`` is used.
        :return: An OrbitTypeMatcherAsync with ``matcher``, ``json_file`` and
            ``version`` set.
        :raises Exception: If ``{key_prefix}/{version}.json`` could not be
            read or is empty.

        Example:
            matcher = await OrbitTypeMatcherAsync.create(version='0.3.4')
            result = matcher.match_type(title='Press Release')
        """
        instance = cls(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            file_bucket=file_bucket,
            key_prefix=key_prefix
        )

        if not version:
            version = await instance.get_newest_version()

        source_key = f'{key_prefix}/{version}.json'
        json_file = await instance.read_s3_file(
            bucket_name=file_bucket,
            file_name=source_key
        )

        if not json_file:
            raise Exception(f'该 S3 文件不存在: {source_key}')

        instance.json_file = json_file
        instance.matcher = L3RuleListMatcher(json_file)
        instance.version = version

        return instance

    async def read_s3_file(self, bucket_name: str, file_name: str):
        """Asynchronously read and parse a JSON object stored in S3.

        :param bucket_name: Name of the S3 bucket.
        :param file_name: Key of the object inside the bucket.
        :return: The decoded JSON data, or None when S3 raised a ClientError
            (the error is logged). Decoding/parsing errors are not caught.
        """
        try:
            async with self.session.client('s3') as s3_client:
                file_obj = await s3_client.get_object(Bucket=bucket_name, Key=file_name)
                content = await file_obj['Body'].read()
                return json.loads(content.decode('utf-8'))
        except ClientError as e:
            logger.error(f"读取 S3 文件失败: {bucket_name}/{file_name}, 错误: {e}")
            return None

    @staticmethod
    def _select_newest_version(keys, key_prefix):
        """Pick the highest dotted-integer version among S3 object keys.

        Only keys of the form ``{key_prefix}/{version}.json`` are considered,
        because that is the only key shape ``create`` can load. Folder
        placeholders, nested objects, ``tmp.json`` files and names whose
        version part is not purely dotted integers are ignored.

        :param keys: Iterable of S3 object keys.
        :param key_prefix: Prefix the version files are stored under.
        :return: The newest version string, without the ``.json`` suffix.
        :raises IndexError: If no usable version file is present. The type is
            kept because the previous implementation failed with IndexError
            on an empty listing.
        """
        suffix = '.json'
        expected_dir = key_prefix.rstrip('/') + '/'
        candidates = {}
        for key in keys:
            if not key.startswith(expected_dir):
                continue
            file_name = key[len(expected_dir):]
            # Empty name == the folder placeholder; '/' == a nested object.
            if not file_name or '/' in file_name:
                continue
            if 'tmp.json' in file_name or not file_name.endswith(suffix):
                continue
            version = file_name[:-len(suffix)]
            try:
                # Compare numerically so that 0.10.0 ranks above 0.9.9.
                candidates[version] = tuple(int(part) for part in version.split('.'))
            except ValueError:
                continue

        if not candidates:
            raise IndexError(f'No version file found under prefix: {key_prefix}')
        return max(candidates, key=candidates.get)

    async def get_newest_version(self):
        """Asynchronously determine the newest rule version stored in S3.

        Every page of the listing is read, so versions are not missed when
        the prefix holds more objects than a single response returns.

        :return: The newest version string.
        :raises IndexError: If no version file exists under ``key_prefix``.
        """
        keys = []
        list_kwargs = {'Bucket': self.file_bucket, 'Prefix': self.key_prefix}
        async with self.session.client('s3') as s3_client:
            while True:
                response = await s3_client.list_objects_v2(**list_kwargs)
                keys.extend(obj['Key'] for obj in response.get('Contents', []))
                next_token = response.get('NextContinuationToken')
                if not response.get('IsTruncated') or not next_token:
                    break
                list_kwargs['ContinuationToken'] = next_token

        return self._select_newest_version(keys, self.key_prefix)

    def get_full_type_list(self):
        """Return every level-3 type joined with its level-2 and level-1 parents.

        Synchronous: it only reads the already-loaded ``self.json_file``.

        :return: A list of dicts with the keys ``lv3_id``, ``lv3_name``,
            ``lv2_id``, ``lv2_name``, ``lv1_id`` and ``lv1_name``, sorted by
            the integer values of (lv3_id, lv2_id, lv1_id).
        """
        lv1_names = {row['lv1_id']: row['lv1_name'] for row in self.json_file['lv1_list']}

        lv2_info = {}
        for row in self.json_file['lv2_list']:
            lv2_info[row['lv2_id']] = {
                'lv2_id': row['lv2_id'],
                'lv2_name': row['lv2_name'],
                'lv1_id': row['lv1_id'],
                'lv1_name': lv1_names[row['lv1_id']]
            }

        full_rows = []
        for row in self.json_file['lv3_list']:
            entry = {
                'lv3_id': row['lv3_id'],
                'lv3_name': row['lv3_name']
            }
            # Level-3 rows reference their parent through 'level_2_id'.
            entry.update(lv2_info[row['level_2_id']])
            full_rows.append(entry)

        return sorted(
            full_rows,
            key=lambda x: (int(x['lv3_id']), int(x['lv2_id']), int(x['lv1_id']))
        )

    def _match_url(self, url: str):
        """Match the path segments of ``url`` from last to first.

        The scheme is removed, the URL is split on '/', segments of four or
        fewer characters are dropped and '%20' is turned into a space.

        :return: ``(f'url#{segment}', match_result)`` for the first segment
            that matches, otherwise ``(None, None)``.
        """
        segments = url.replace('http://', '').replace('https://', '').split('/')
        segments = [x.strip().replace('%20', ' ') for x in segments if len(x.strip()) > 4]
        for part in reversed(segments):
            match_result = self.matcher.start_match_all(part)
            if match_result:
                return f'url#{part}', match_result
        return None, None

    def _match_field(self, key, value):
        """Match one keyword argument; the 'url' key gets segment-wise matching.

        :return: ``(result_key, match_result)`` where ``result_key`` is the
            'url#...' key for URLs and ``key`` itself for any other field.
        """
        if key == 'url':
            return self._match_url(value)
        return key, self.matcher.start_match_all(value)

    def match_type(self, match_flag: Literal['in_order', 'match_all'] = 'in_order', **match_kwargs):
        """Match the given fields against the loaded rules.

        Synchronous: matching works on in-memory data only.

        :param match_flag:
            - 'in_order': try the fields in the order given and stop at the
              first one that yields a result.
            - 'match_all': match every field and merge the results.
        :param match_kwargs: Fields to match, e.g. title='...', url='...',
            description='...'.
        :return: For 'in_order', a tuple ``(key, results)``; when nothing
            matches it is ``(None, default)`` with the default being the
            'Miscellaneous' type. For 'match_all', a dict with 'results'
            (de-duplicated by lv3_id and sorted by its integer value, or the
            default when nothing matched) and 'match_detail' (results per
            matching field).
        :raises ValueError: If ``match_flag`` is invalid or no field is given.

        Example:
            key, result = matcher.match_type(
                match_flag='in_order',
                title='Press Release',
                description='Company news'
            )

            result = matcher.match_type(
                match_flag='match_all',
                title='Press Release',
                url='https://example.com/news/press-release'
            )
        """
        if match_flag not in ['in_order', 'match_all']:
            raise ValueError('match_flag 参数必须是 "in_order" 或 "match_all"')
        if not match_kwargs:
            raise ValueError('必须传入匹配关键词!')

        default_result = [{'lv3_id': '19999', 'lv3_name': 'Miscellaneous'}]

        if match_flag == 'in_order':
            for key, value in match_kwargs.items():
                result_key, match_result = self._match_field(key, value)
                if match_result:
                    return result_key, match_result
            return None, default_result

        # match_flag == 'match_all'
        overall_results = []
        match_detail = {}
        for key, value in match_kwargs.items():
            result_key, match_result = self._match_field(key, value)
            if match_result:
                overall_results += match_result
                match_detail[result_key] = match_result

        if not overall_results:
            overall_results = default_result

        # Later entries win when the same lv3_id appears more than once.
        unique_dict = {d['lv3_id']: d for d in overall_results}
        sorted_results = sorted(unique_dict.values(), key=lambda x: int(x['lv3_id']))

        return {
            'results': sorted_results,
            'match_detail': match_detail
        }
727
+
728
+
462
729
  if __name__ == '__main__':
730
+ # Synchronous usage example
463
731
  matcher = OrbitTypeMatcher(version='0.2.1')
464
732
  matcher.match_type(title='asdf')
733
+
734
+ # Asynchronous usage example
735
+ # import asyncio
736
+ # async def test_async():
737
+ # matcher = await OrbitTypeMatcherAsync.create(version='0.3.4')
738
+ # result = matcher.match_type(match_flag='in_order', title='Press Release')
739
+ # print(result)
740
+ # asyncio.run(test_async())
@@ -49,6 +49,7 @@ from .util_html import (
49
49
  )
50
50
  from .util_aws_s3_wrapper import (
51
51
  AwsS3Wrapper,
52
+ AwsS3WrapperAsync,
52
53
  )
53
54
  from .util_aliyun import (
54
55
  oss_split_path,
@@ -0,0 +1,369 @@
1
+ import logging
2
+ import os.path
3
+ from typing import Optional
4
+ import boto3
5
+ from orbitkit.util import get_from_dict_or_env, s3_split_path, get_content_type_4_filename
6
+ import botocore
7
+ from botocore.exceptions import ClientError
8
+ import aioboto3
9
+ import aiofiles
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class AwsS3Wrapper:
    """Encapsulates Amazon s3 actions for Orbitfin"""

    def __init__(self, s3_resource, s3_client):
        """
        :param s3_resource: boto3.resource('s3')
        :param s3_client: boto3.client('s3')
        """
        self.s3_resource = s3_resource
        self.s3_client = s3_client

    @classmethod
    def from_s3(cls, *args, **kwargs):
        """Build a wrapper from an AWS key pair.

        Each value is looked up through ``get_from_dict_or_env``: from
        ``kwargs`` ("aws_access_key_id" / "aws_secret_access_key") or from the
        environment (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY). Positional
        arguments are accepted but not used.

        :return: An AwsS3Wrapper holding a boto3 S3 resource and client.
        """
        aws_access_key_id = get_from_dict_or_env(
            kwargs, "aws_access_key_id", "AWS_ACCESS_KEY_ID",
        )
        aws_secret_access_key = get_from_dict_or_env(
            kwargs, "aws_secret_access_key", "AWS_SECRET_ACCESS_KEY",
        )

        credentials = {
            "aws_access_key_id": aws_access_key_id,
            "aws_secret_access_key": aws_secret_access_key,
        }
        s3_resource = boto3.resource('s3', **credentials)
        s3_client = boto3.client('s3', **credentials)

        return cls(s3_resource, s3_client)

    def get_s3_resource(self):
        """Return the underlying boto3 S3 resource."""
        return self.s3_resource

    def get_s3_client(self):
        """Return the underlying boto3 S3 client."""
        return self.s3_client

    def check_file_exist(self, s3_path: str) -> bool:
        """Check whether an object exists.

        :param s3_path: Target store path for s3.
        :return: True if the object could be loaded, False if S3 answered
            with error code "404".
        :raises Exception: For any other ClientError; the original error is
            attached as ``__cause__``.
        """
        s3_path_obj = s3_split_path(s3_path)
        try:
            self.s3_resource.Object(s3_path_obj["bucket"], s3_path_obj["store_path"]).load()
        except ClientError as e:
            if e.response['Error']['Code'] == "404":
                # The object does not exist.
                return False
            # Something else has gone wrong; keep the cause for debugging.
            raise Exception("Check s3 file exist unknown error...") from e
        # The object does exist.
        return True

    def copy_file(self, source_path: str, target_path: str):
        """Copy an object from one S3 location to another.

        :param source_path: Source s3 path location
        :param target_path: Target s3 path location
        :return:
        """
        source_path_obj = s3_split_path(source_path)
        target_path_obj = s3_split_path(target_path)

        self.s3_resource.Object(target_path_obj["bucket"], target_path_obj["store_path"]).copy_from(
            CopySource=source_path_obj["bucket"] + '/' + source_path_obj["store_path"],
        )

    def delete_file(self, s3_path: str):
        """Delete an object.

        :param s3_path: Target store path for s3.
        :return:
        """
        s3_path_obj = s3_split_path(s3_path)
        self.s3_resource.Object(s3_path_obj["bucket"], s3_path_obj["store_path"]).delete()

    def download_file(self, s3_path: str, local_path: str, filename: str):
        """Download an object into ``local_path/filename``.

        :param s3_path: Target store path for s3.
        :param local_path: Local directory; created if missing.
        :param filename: File name
        :return:
        """
        # exist_ok avoids failing when the directory is created concurrently
        # between an existence check and the makedirs call.
        os.makedirs(local_path, exist_ok=True)

        s3_path_obj = s3_split_path(s3_path)
        self.s3_resource.Bucket(s3_path_obj["bucket"]).download_file(
            s3_path_obj["store_path"], os.path.join(local_path, filename)
        )

    def upload_by_local_path(self, s3_path: str, local_path: str, text_with_utf8: bool = True):
        """Upload a local file.

        :param s3_path: Target store path for s3.
        :param local_path: Local file path.
        :param text_with_utf8: If content-type start with "text/" then put ;charset=utf-8 after.
        :return:
        :raises Exception: If ``local_path`` does not exist.
        """
        if not os.path.exists(local_path):
            raise Exception("Local file doesn't exist!")

        # The content type is derived from the S3 path, not the local one.
        content_type = get_content_type_4_filename(s3_path, text_with_utf8)
        s3_path_obj = s3_split_path(s3_path)

        self.s3_client.upload_file(local_path,
                                   s3_path_obj["bucket"],
                                   s3_path_obj["store_path"],
                                   ExtraArgs={'ContentType': content_type})

    def upload_file(self, s3_path: str, content: bytes, metadata: Optional[dict] = None, text_with_utf8: bool = True):
        """Upload in-memory content.

        :param s3_path: Target store path for s3.
        :param content: The content of file, if text-like use content.encode("utf-8"), if binary then put directly.
        :param metadata: Custom metadata for file.
        :param text_with_utf8: If content-type start with "text/" then put ;charset=utf-8 after.
        :return:
        """
        s3_path_obj = s3_split_path(s3_path)
        content_type = get_content_type_4_filename(s3_path, text_with_utf8)

        put_kwargs = {'Body': content, 'ContentType': content_type}
        if metadata:
            # Metadata is only sent when non-empty.
            put_kwargs['Metadata'] = metadata

        self.s3_resource.Object(s3_path_obj["bucket"], s3_path_obj["store_path"]).put(**put_kwargs)

    def get_file_meta_info(self, s3_path: str) -> dict:
        """Fetch an object's content type and custom metadata via head_object.

        :param s3_path: Target store path for s3.
        :return: Dict with the keys "content_type" and "metadata".
        """
        s3_path_obj = s3_split_path(s3_path)
        response = self.s3_client.head_object(Bucket=s3_path_obj["bucket"], Key=s3_path_obj["store_path"])
        return {
            "content_type": response['ContentType'],
            "metadata": response['Metadata'],
        }

    def read_text_like_file(self, s3_path: str, decoding: str = "utf-8") -> str:
        """Read an object fully and decode it to text.

        :param s3_path: Target store path for s3.
        :param decoding: decoding, default is "utf-8".
        :return: The decoded content.
        """
        s3_path_obj = s3_split_path(s3_path)
        obj = self.s3_client.get_object(Bucket=s3_path_obj["bucket"], Key=s3_path_obj["store_path"])
        return obj['Body'].read().decode(decoding)
157
+
158
+
159
class AwsS3WrapperAsync:
    """Encapsulates Amazon S3 async actions for Orbitfin using aioboto3"""

    # Number of bytes requested per read while streaming a download to disk.
    _DOWNLOAD_CHUNK_SIZE = 1024 * 1024

    def __init__(self, aws_access_key_id: Optional[str] = None, aws_secret_access_key: Optional[str] = None):
        """Create the aioboto3 session used by every operation.

        :param aws_access_key_id: AWS access key ID (optional).
        :param aws_secret_access_key: AWS secret access key (optional).

        Explicit credentials are used only when BOTH values are given;
        otherwise a default ``aioboto3.Session()`` is created and credential
        resolution is left to it.
        """
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key

        if aws_access_key_id and aws_secret_access_key:
            self.session = aioboto3.Session(
                aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key
            )
        else:
            self.session = aioboto3.Session()

    @classmethod
    def from_s3(cls, *args, **kwargs):
        """Create an AwsS3WrapperAsync instance.

        Each credential is taken from ``kwargs`` (aws_access_key_id,
        aws_secret_access_key) and, when missing or empty there, from the
        environment (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY). If either is
        still missing, the constructor falls back to a default session.
        Positional arguments are accepted but not used.

        Example:
            # Rely on the default session
            wrapper = AwsS3WrapperAsync.from_s3()

            # Pass credentials explicitly
            wrapper = AwsS3WrapperAsync.from_s3(
                aws_access_key_id="xxx",
                aws_secret_access_key="yyy"
            )
        """
        aws_access_key_id = kwargs.get("aws_access_key_id") or os.environ.get("AWS_ACCESS_KEY_ID")
        aws_secret_access_key = kwargs.get("aws_secret_access_key") or os.environ.get("AWS_SECRET_ACCESS_KEY")

        return cls(aws_access_key_id, aws_secret_access_key)

    async def check_file_exist(self, s3_path: str) -> bool:
        """Check whether an S3 object exists.

        :param s3_path: S3 path.
        :return: True if head_object succeeds, False if S3 answered with
            error code "404".
        :raises Exception: For any other ClientError; the original error is
            attached as ``__cause__``.
        """
        s3_path_obj = s3_split_path(s3_path)

        async with self.session.client('s3') as s3_client:
            try:
                await s3_client.head_object(
                    Bucket=s3_path_obj["bucket"],
                    Key=s3_path_obj["store_path"]
                )
            except ClientError as e:
                if e.response['Error']['Code'] == "404":
                    return False
                # Keep the cause so callers can see what actually failed.
                raise Exception("Check s3 file exist unknown error...") from e
            return True

    async def copy_file(self, source_path: str, target_path: str):
        """Copy an S3 object.

        :param source_path: Source S3 path.
        :param target_path: Target S3 path.
        """
        source_path_obj = s3_split_path(source_path)
        target_path_obj = s3_split_path(target_path)

        async with self.session.client('s3') as s3_client:
            await s3_client.copy_object(
                CopySource={
                    'Bucket': source_path_obj["bucket"],
                    'Key': source_path_obj["store_path"]
                },
                Bucket=target_path_obj["bucket"],
                Key=target_path_obj["store_path"]
            )

    async def delete_file(self, s3_path: str):
        """Delete an S3 object.

        :param s3_path: S3 path.
        """
        s3_path_obj = s3_split_path(s3_path)

        async with self.session.client('s3') as s3_client:
            await s3_client.delete_object(
                Bucket=s3_path_obj["bucket"],
                Key=s3_path_obj["store_path"]
            )

    async def download_file(self, s3_path: str, local_path: str, filename: str):
        """Download an S3 object into ``local_path/filename``.

        The body is written chunk by chunk so that large objects are not
        held in memory as a whole.

        :param s3_path: S3 path.
        :param local_path: Local directory; created if missing.
        :param filename: File name.
        """
        # exist_ok avoids failing when the directory is created concurrently.
        os.makedirs(local_path, exist_ok=True)

        s3_path_obj = s3_split_path(s3_path)
        local_file_path = os.path.join(local_path, filename)

        async with self.session.client('s3') as s3_client:
            response = await s3_client.get_object(
                Bucket=s3_path_obj["bucket"],
                Key=s3_path_obj["store_path"]
            )
            body = response['Body']
            async with aiofiles.open(local_file_path, 'wb') as f:
                while True:
                    chunk = await body.read(self._DOWNLOAD_CHUNK_SIZE)
                    if not chunk:
                        break
                    await f.write(chunk)

    async def upload_by_local_path(self, s3_path: str, local_path: str, text_with_utf8: bool = True):
        """Upload a local file to S3.

        Uses the client's managed ``upload_file`` transfer, as the
        synchronous wrapper does, instead of reading the whole file into
        memory first.

        :param s3_path: Target S3 path.
        :param local_path: Local file path.
        :param text_with_utf8: For text content types, whether to append the
            utf-8 charset marker.
        :raises Exception: If ``local_path`` does not exist.
        """
        if not os.path.exists(local_path):
            raise Exception("Local file doesn't exist!")

        # The content type is derived from the S3 path, not the local one.
        content_type = get_content_type_4_filename(s3_path, text_with_utf8)
        s3_path_obj = s3_split_path(s3_path)

        async with self.session.client('s3') as s3_client:
            await s3_client.upload_file(
                Filename=local_path,
                Bucket=s3_path_obj["bucket"],
                Key=s3_path_obj["store_path"],
                ExtraArgs={'ContentType': content_type}
            )

    async def upload_file(self, s3_path: str, content: bytes, metadata: Optional[dict] = None, text_with_utf8: bool = True):
        """Upload in-memory content to S3.

        :param s3_path: Target S3 path.
        :param content: File content (bytes).
        :param metadata: Custom metadata; only sent when non-empty.
        :param text_with_utf8: For text content types, whether to append the
            utf-8 charset marker.
        """
        s3_path_obj = s3_split_path(s3_path)
        content_type = get_content_type_4_filename(s3_path, text_with_utf8)

        put_args = {
            'Bucket': s3_path_obj["bucket"],
            'Key': s3_path_obj["store_path"],
            'Body': content,
            'ContentType': content_type
        }
        if metadata:
            put_args['Metadata'] = metadata

        async with self.session.client('s3') as s3_client:
            await s3_client.put_object(**put_args)

    async def get_file_meta_info(self, s3_path: str) -> dict:
        """Fetch an object's content type and custom metadata via head_object.

        :param s3_path: S3 path.
        :return: Dict with the keys "content_type" and "metadata".
        """
        s3_path_obj = s3_split_path(s3_path)

        async with self.session.client('s3') as s3_client:
            response = await s3_client.head_object(
                Bucket=s3_path_obj["bucket"],
                Key=s3_path_obj["store_path"]
            )
            return {
                "content_type": response['ContentType'],
                "metadata": response['Metadata'],
            }

    async def read_text_like_file(self, s3_path: str, decoding: str = "utf-8") -> str:
        """Read an S3 object fully and decode it to text.

        :param s3_path: S3 path.
        :param decoding: Codec used for decoding, default "utf-8".
        :return: The decoded content.
        """
        s3_path_obj = s3_split_path(s3_path)

        async with self.session.client('s3') as s3_client:
            response = await s3_client.get_object(
                Bucket=s3_path_obj["bucket"],
                Key=s3_path_obj["store_path"]
            )
            content = await response['Body'].read()
            return content.decode(decoding)
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: orbitkit
3
- Version: 0.8.46
3
+ Version: 0.8.48
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -19,15 +19,37 @@ Classifier: Programming Language :: Python :: 3.4
19
19
  Classifier: Programming Language :: Python :: 3.5
20
20
  Classifier: Programming Language :: Python :: 3.6
21
21
  Classifier: Programming Language :: Python :: 3.7
22
+ Classifier: Programming Language :: Python :: 3.8
23
+ Classifier: Programming Language :: Python :: 3.9
24
+ Classifier: Programming Language :: Python :: 3.10
25
+ Classifier: Programming Language :: Python :: 3.11
26
+ Classifier: Programming Language :: Python :: 3.12
27
+ Classifier: Programming Language :: Python :: 3.13
28
+ Classifier: Programming Language :: Python :: 3.14
22
29
  Classifier: Topic :: Software Development :: Libraries
23
30
  Description-Content-Type: text/markdown
24
31
  License-File: LICENSE
25
- Requires-Dist: boto3>=1.16.0
26
- Requires-Dist: requests>=2.12.1
27
- Requires-Dist: prettytable>=3.0.0
28
- Requires-Dist: pytz>=2022.1
32
+ Requires-Dist: boto3>=1.40.46
33
+ Requires-Dist: aioboto3>=15.5.0
34
+ Requires-Dist: aiofiles>=25.1.0
35
+ Requires-Dist: requests>=2.32.5
36
+ Requires-Dist: prettytable>=3.17.0
37
+ Requires-Dist: pytz>=2025.2
29
38
  Requires-Dist: Deprecated
30
39
  Requires-Dist: func_timeout
40
+ Dynamic: author
41
+ Dynamic: author-email
42
+ Dynamic: classifier
43
+ Dynamic: description
44
+ Dynamic: description-content-type
45
+ Dynamic: home-page
46
+ Dynamic: license
47
+ Dynamic: license-file
48
+ Dynamic: maintainer
49
+ Dynamic: maintainer-email
50
+ Dynamic: platform
51
+ Dynamic: requires-dist
52
+ Dynamic: summary
31
53
 
32
54
  # orbitkit
33
55
 
@@ -0,0 +1,8 @@
1
+ boto3>=1.40.46
2
+ aioboto3>=15.5.0
3
+ aiofiles>=25.1.0
4
+ requests>=2.32.5
5
+ prettytable>=3.17.0
6
+ pytz>=2025.2
7
+ Deprecated
8
+ func_timeout
@@ -32,15 +32,24 @@ setup(
32
32
  'Programming Language :: Python :: 3.5',
33
33
  'Programming Language :: Python :: 3.6',
34
34
  'Programming Language :: Python :: 3.7',
35
+ 'Programming Language :: Python :: 3.8',
36
+ 'Programming Language :: Python :: 3.9',
37
+ 'Programming Language :: Python :: 3.10',
38
+ 'Programming Language :: Python :: 3.11',
39
+ 'Programming Language :: Python :: 3.12',
40
+ 'Programming Language :: Python :: 3.13',
41
+ 'Programming Language :: Python :: 3.14',
35
42
  'Topic :: Software Development :: Libraries'
36
43
  ],
37
44
  include_package_data=True,
38
45
  zip_safe=False,
39
46
  install_requires=[
40
- "boto3 >= 1.16.0",
41
- "requests >= 2.12.1",
42
- "prettytable >= 3.0.0",
43
- "pytz >= 2022.1",
47
+ "boto3 >= 1.40.46",
48
+ "aioboto3 >= 15.5.0",
49
+ "aiofiles >= 25.1.0",
50
+ "requests >= 2.32.5",
51
+ "prettytable >= 3.17.0",
52
+ "pytz >= 2025.2",
44
53
  "Deprecated",
45
54
  "func_timeout",
46
55
  ]
@@ -1 +0,0 @@
1
- 0.8.46
@@ -1 +0,0 @@
1
- from .orbit_type_simple import OrbitTypeMatcher
@@ -1,154 +0,0 @@
1
- import logging
2
- import os.path
3
- from typing import Optional
4
- import boto3
5
- from orbitkit.util import get_from_dict_or_env, s3_split_path, get_content_type_4_filename
6
- import botocore
7
- from botocore.exceptions import ClientError
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
-
12
class AwsS3Wrapper:
    """Encapsulates Amazon s3 actions for Orbitfin.

    Thin synchronous wrapper around a boto3 S3 resource and client. Every
    ``s3_path`` argument is split into a bucket and an object key with
    ``s3_split_path`` before it is handed to boto3.
    """

    def __init__(self, s3_resource, s3_client):
        """
        :param s3_resource: boto3.resource('s3')
        :param s3_client: boto3.client('s3')
        """
        self.s3_resource = s3_resource
        self.s3_client = s3_client

    @classmethod
    def from_s3(cls, *args, **kwargs):
        """
        Alternate constructor that builds the boto3 resource and client itself.

        The key pair is resolved with ``get_from_dict_or_env`` using the
        ``aws_access_key_id`` / ``aws_secret_access_key`` entries of *kwargs*
        and the names ``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY``
        (presumably environment variables; see that helper for the exact
        lookup rules). Positional arguments are accepted but unused.

        :return: A new wrapper instance.
        """
        aws_access_key_id = get_from_dict_or_env(
            kwargs, "aws_access_key_id", "AWS_ACCESS_KEY_ID",
        )
        aws_secret_access_key = get_from_dict_or_env(
            kwargs, "aws_secret_access_key", "AWS_SECRET_ACCESS_KEY",
        )

        credentials = {
            "aws_access_key_id": aws_access_key_id,
            "aws_secret_access_key": aws_secret_access_key,
        }
        s3_resource = boto3.resource('s3', **credentials)
        s3_client = boto3.client('s3', **credentials)

        return cls(s3_resource, s3_client)

    def get_s3_resource(self):
        """Return the boto3 S3 resource this wrapper was built with."""
        return self.s3_resource

    def get_s3_client(self):
        """Return the boto3 S3 client this wrapper was built with."""
        return self.s3_client

    def check_file_exist(self, s3_path: str) -> bool:
        """
        Check whether an object exists by loading its metadata.

        :param s3_path: Target store path for s3.
        :return: True if the object exists, False if S3 answers with a 404.
        :raises Exception: For any other ``ClientError``; the original error
            is attached as ``__cause__``.
        """
        s3_path_obj = s3_split_path(s3_path)
        try:
            self.s3_resource.Object(s3_path_obj["bucket"], s3_path_obj["store_path"]).load()
        except ClientError as e:
            if e.response['Error']['Code'] == "404":
                # The object does not exist.
                return False
            # Something else has gone wrong; chain the cause explicitly so the
            # real S3 error code stays visible to whoever handles this.
            raise Exception("Check s3 file exist unknown error...") from e
        # The object does exist.
        return True

    def copy_file(self, source_path: str, target_path: str):
        """
        Copy an object from one S3 location to another.

        :param source_path: Source s3 path location
        :param target_path: Target s3 path location
        :return:
        """
        source_path_obj = s3_split_path(source_path)
        target_path_obj = s3_split_path(target_path)

        # The dict form of CopySource lets boto3 do the encoding; the
        # "bucket/key" string form must be URL-encoded by the caller and
        # breaks on keys containing characters such as spaces or '+'.
        self.s3_resource.Object(target_path_obj["bucket"], target_path_obj["store_path"]).copy_from(
            CopySource={
                "Bucket": source_path_obj["bucket"],
                "Key": source_path_obj["store_path"],
            },
        )

    def delete_file(self, s3_path: str):
        """
        Delete a single object.

        :param s3_path: Target store path for s3.
        :return:
        """
        s3_path_obj = s3_split_path(s3_path)
        self.s3_resource.Object(s3_path_obj["bucket"], s3_path_obj["store_path"]).delete()

    def download_file(self, s3_path: str, local_path: str, filename: str):
        """
        Download an object into ``local_path/filename``.

        :param s3_path: Target store path for s3.
        :param local_path: Local directory; created if it does not exist.
        :param filename: File name inside ``local_path``.
        :return:
        """
        # exist_ok avoids the check-then-create race when several workers
        # download into the same directory at the same time.
        os.makedirs(local_path, exist_ok=True)

        s3_path_obj = s3_split_path(s3_path)
        self.s3_resource.Bucket(s3_path_obj["bucket"]).download_file(
            s3_path_obj["store_path"], os.path.join(local_path, filename)
        )

    def upload_by_local_path(self, s3_path: str, local_path: str, text_with_utf8: bool = True):
        """
        Upload a local file to S3.

        :param s3_path: Target store path for s3.
        :param local_path: Local file path.
        :param text_with_utf8: If content-type start with "text/" then put ;charset=utf-8 after.
        :return:
        :raises FileNotFoundError: If ``local_path`` does not exist.
        """
        if not os.path.exists(local_path):
            # FileNotFoundError is still an Exception subclass, so callers
            # that catch Exception keep working.
            raise FileNotFoundError("Local file doesn't exist!")

        content_type = get_content_type_4_filename(s3_path, text_with_utf8)
        s3_path_obj = s3_split_path(s3_path)

        self.s3_client.upload_file(local_path,
                                   s3_path_obj["bucket"],
                                   s3_path_obj["store_path"],
                                   ExtraArgs={'ContentType': content_type})

    def upload_file(self, s3_path: str, content: bytes, metadata: Optional[dict] = None, text_with_utf8: bool = True):
        """
        Upload in-memory content to S3.

        :param s3_path: Target store path for s3.
        :param content: The content of file, if text-like use content.encode("utf-8"), if binary then put directly.
        :param metadata: Custom metadata for file; only sent when truthy.
        :param text_with_utf8: If content-type start with "text/" then put ;charset=utf-8 after.
        :return:
        """
        s3_path_obj = s3_split_path(s3_path)
        content_type = get_content_type_4_filename(s3_path, text_with_utf8)

        put_args = {"Body": content, "ContentType": content_type}
        if metadata:
            put_args["Metadata"] = metadata

        self.s3_resource.Object(s3_path_obj["bucket"], s3_path_obj["store_path"]).put(**put_args)

    def get_file_meta_info(self, s3_path: str) -> dict:
        """
        Fetch an object's content type and custom metadata via a HEAD request.

        :param s3_path: Target store path for s3.
        :return: Dict with the keys ``content_type`` and ``metadata``.
        """
        s3_path_obj = s3_split_path(s3_path)
        response = self.s3_client.head_object(Bucket=s3_path_obj["bucket"], Key=s3_path_obj["store_path"])
        return {
            "content_type": response['ContentType'],
            "metadata": response['Metadata'],
        }

    def read_text_like_file(self, s3_path: str, decoding: str = "utf-8") -> str:
        """
        Download an object and decode it as text.

        :param s3_path: Target store path for s3.
        :param decoding: decoding, default is "utf-8".
        :return: The decoded object content.
        """
        s3_path_obj = s3_split_path(s3_path)
        obj = self.s3_client.get_object(Bucket=s3_path_obj["bucket"], Key=s3_path_obj["store_path"])
        return obj['Body'].read().decode(decoding)
@@ -1,6 +0,0 @@
1
- boto3>=1.16.0
2
- requests>=2.12.1
3
- prettytable>=3.0.0
4
- pytz>=2022.1
5
- Deprecated
6
- func_timeout
File without changes
File without changes
File without changes
File without changes