cobweb-launcher 3.1.29__tar.gz → 3.1.31__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/PKG-INFO +1 -1
  2. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/base/request.py +124 -2
  3. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb_launcher.egg-info/PKG-INFO +1 -1
  4. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/setup.py +1 -1
  5. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/LICENSE +0 -0
  6. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/README.md +0 -0
  7. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/__init__.py +0 -0
  8. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/base/__init__.py +0 -0
  9. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/base/common_queue.py +0 -0
  10. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/base/item.py +0 -0
  11. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/base/logger.py +0 -0
  12. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/base/response.py +0 -0
  13. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/base/seed.py +0 -0
  14. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/base/task_queue.py +0 -0
  15. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/base/test.py +0 -0
  16. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/constant.py +0 -0
  17. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/crawlers/__init__.py +0 -0
  18. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/crawlers/crawler.py +0 -0
  19. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/db/__init__.py +0 -0
  20. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/db/api_db.py +0 -0
  21. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/db/redis_db.py +0 -0
  22. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/exceptions/__init__.py +0 -0
  23. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/exceptions/oss_db_exception.py +0 -0
  24. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/launchers/__init__.py +0 -0
  25. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/launchers/distributor.py +0 -0
  26. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/launchers/launcher.py +0 -0
  27. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/launchers/uploader.py +0 -0
  28. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/pipelines/__init__.py +0 -0
  29. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/pipelines/pipeline.py +0 -0
  30. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/pipelines/pipeline_csv.py +0 -0
  31. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/pipelines/pipeline_loghub.py +0 -0
  32. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/schedulers/__init__.py +0 -0
  33. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/schedulers/scheduler.py +0 -0
  34. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/schedulers/scheduler_with_redis.py +0 -0
  35. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/setting.py +0 -0
  36. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/utils/__init__.py +0 -0
  37. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/utils/bloom.py +0 -0
  38. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/utils/decorators.py +0 -0
  39. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/utils/dotting.py +0 -0
  40. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/utils/oss.py +0 -0
  41. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb/utils/tools.py +0 -0
  42. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb_launcher.egg-info/SOURCES.txt +0 -0
  43. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  44. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb_launcher.egg-info/requires.txt +0 -0
  45. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/cobweb_launcher.egg-info/top_level.txt +0 -0
  46. {cobweb-launcher-3.1.29 → cobweb-launcher-3.1.31}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 3.1.29
3
+ Version: 3.1.31
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -1,5 +1,7 @@
1
1
  import random
2
2
  import logging
3
+ import time
4
+
3
5
  import requests
4
6
 
5
7
  from urllib.parse import urlparse
@@ -215,7 +217,7 @@ class FileTypeDetector:
215
217
  result['methods_used'].append('extension')
216
218
 
217
219
  # 3. 如果前两种方法不确定,使用文件签名检测
218
- if result['confidence'] in ['unknown', 'medium']:
220
+ if data and result['confidence'] in ['unknown', 'medium']:
219
221
  signature_detected = self.detect_by_signature(data)
220
222
  if signature_detected:
221
223
  if not result['detected_type']:
@@ -309,6 +311,8 @@ class Request:
309
311
  """
310
312
  self.scheme = None
311
313
  self.netloc = None
314
+ self.detector_info = None
315
+ self.content_length = None
312
316
  self._validate_url(url)
313
317
 
314
318
  self.url = url
@@ -408,6 +412,124 @@ class Request:
408
412
  """下载方法,为了向后兼容性保留"""
409
413
  return self.execute()
410
414
 
def normal_download(self, file_type_detect: bool = True) -> bytes:
    """Download the whole resource with a single, non-streaming request.

    The request is sent with ``self.method`` to ``self.url`` using a copy of
    ``self.request_settings`` from which ``stream`` has been removed, so the
    complete body is read into memory.

    Args:
        file_type_detect: When true and ``self.detector_info`` is still
            unset, pass the URL, the response ``content-type`` header and
            the first 64 bytes of the body to
            ``FileTypeDetector.get_detailed_info`` and store the result in
            ``self.detector_info``.

    Returns:
        The complete response body.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status()`` when
            ``self.check_status_code`` is truthy and the server answered
            with an error status.
    """
    settings = self.request_settings.copy()
    settings.pop('stream', None)

    response = requests.request(
        method=self.method,
        url=self.url,
        **settings
    )
    try:
        if self.check_status_code:
            response.raise_for_status()
        content_type = response.headers.get('content-type')
        body = response.content
    finally:
        # Close on every path: previously a failing raise_for_status()
        # skipped close() and left the connection checked out.
        response.close()

    if file_type_detect and not self.detector_info:
        # Only a short prefix is handed to the detector.
        self.detector_info = FileTypeDetector().get_detailed_info(
            url=self.url, content_type=content_type, data=body[:64]
        )

    return body
441
+
# The return annotation is quoted on purpose: the previous ``iter(bytes)`` was
# evaluated when the ``def`` ran and raised ``TypeError`` ('type' object is not
# iterable). NOTE(review): import ``Iterator`` from ``typing`` at the top of
# the file if annotations are resolved at runtime (e.g. get_type_hints).
def range_download(self, start: int = 0, chunk_size: int = 1024, file_type_detect: bool = True) -> "Iterator[bytes]":
    """Download the resource piece by piece using HTTP ``Range`` requests.

    This is a generator: nothing (including argument validation) runs until
    the first item is requested. ``self.content_length`` must already hold
    the total size in bytes; in the visible code it is populated by
    ``detect_accept_ranges()``.

    Args:
        start: Byte offset at which to begin downloading.
        chunk_size: Maximum number of bytes requested per Range request.
        file_type_detect: When true and ``self.detector_info`` is still
            unset, fetch bytes 0-63 first and store the result of
            ``FileTypeDetector.get_detailed_info`` in ``self.detector_info``.

    Yields:
        Consecutive chunks of the response body, in order.

    Raises:
        ValueError: If ``start`` is negative, ``chunk_size`` is not positive,
            ``self.content_length`` is ``None``, or a chunk still cannot be
            fetched after the retry budget is used up.
    """
    if start < 0 or chunk_size <= 0:
        raise ValueError("start must be >= 0 and chunk_size must be > 0")

    # The original mixed ``self.content_length`` with ``self.total_size``;
    # only ``content_length`` is assigned by the visible code, so it is used
    # consistently here.
    total_size = self.content_length
    if total_size is None:
        raise ValueError("content_length is unknown; call detect_accept_ranges() first")

    max_retries = 3
    retry_count = 0
    downloaded = start

    base_settings = self.request_settings.copy()
    base_settings.pop('stream', None)

    def fetch_range(first: int, last: int):
        # Build fresh settings and a fresh headers dict per request so the
        # Range header never leaks into the shared request_settings["headers"]
        # (dict.copy() above is shallow).
        settings = dict(base_settings)
        headers = dict(settings.get("headers") or {})
        headers['Range'] = f"bytes={first}-{last}"
        settings["headers"] = headers
        return requests.request(method=self.method, url=self.url, **settings)

    if file_type_detect and not self.detector_info:
        probe = fetch_range(0, 63)
        try:
            content_type = probe.headers.get("Content-Type")
            head_data = probe.content
        finally:
            probe.close()

        # A server that ignores Range returns the whole body; the detector
        # only needs the leading bytes.
        self.detector_info = FileTypeDetector().get_detailed_info(
            url=self.url, content_type=content_type, data=head_data[:64]
        )

    while downloaded < total_size:
        range_end = min(downloaded + chunk_size - 1, total_size - 1)
        response = fetch_range(downloaded, range_end)
        try:
            status = response.status_code
            chunk_data = response.content if status == 206 else b""
        finally:
            # Closed on every path, including the 416 and retry paths.
            response.close()

        if status == 416:  # Range Not Satisfiable
            logging.info("Range请求超出范围")
            return

        if status == 206 and chunk_data:
            yield chunk_data
            downloaded += len(chunk_data)
            retry_count = 0  # a successful chunk resets the retry budget
            logging.info(f"下载进度: {downloaded}/{total_size} ({downloaded / total_size * 100:.1f}%)")
            continue

        # Any other status, or an empty 206 body, would otherwise repeat the
        # same request forever because ``downloaded`` never advances.
        logging.debug(f"Range请求失败: {status}")
        if retry_count >= max_retries:
            raise ValueError(
                f"range request for bytes {downloaded}-{range_end} failed "
                f"after {max_retries} retries (last status {status})"
            )
        retry_count += 1
        time.sleep(0.5)
497
+
def detect_accept_ranges(self) -> bool:
    """Probe whether the server supports byte-range downloads for ``self.url``.

    A ``HEAD`` request supplies ``content-length`` (stored in
    ``self.content_length``) and ``accept-ranges``. Unless the server
    advertises ``accept-ranges: none`` or reports no length, a real request
    for bytes 0-63 is sent with ``self.method`` to confirm that a ``206``
    response with the expected number of bytes comes back.

    Returns:
        ``True`` when the probe confirms range support, ``False`` otherwise.

    Raises:
        ValueError: If the ``HEAD`` request returns a status other than 200
            or 206, or the ``content-length`` header is not an integer.
    """
    settings = self.request_settings.copy()
    settings.pop('stream', None)

    head_response = requests.head(self.url, **settings)
    try:
        head_status = head_response.status_code
        length_header = head_response.headers.get('content-length', 0)
        accept_ranges = str(head_response.headers.get('accept-ranges')).lower()
    finally:
        head_response.close()

    if head_status not in (200, 206):
        logging.error(f"HEAD请求失败: {head_status}")
        raise ValueError("HTTP状态码错误")

    self.content_length = int(length_header)

    if accept_ranges == 'none' or not self.content_length:
        return False

    # Copy the headers dict as well: settings.copy() is shallow, so writing
    # Range through setdefault() used to modify request_settings["headers"]
    # and the header was then sent with every later request.
    probe_settings = dict(settings)
    probe_headers = dict(probe_settings.get("headers") or {})
    probe_headers['Range'] = "bytes=0-63"
    probe_settings["headers"] = probe_headers

    probe = requests.request(
        method=self.method,
        url=self.url,
        **probe_settings
    )
    try:
        if probe.status_code != 206:
            return False
        actual_length = len(probe.content)
    finally:
        probe.close()

    # A resource shorter than 64 bytes legitimately returns fewer bytes.
    expected_length = min(64, self.content_length)
    if actual_length != expected_length:
        logging.debug(f"⚠️ Range请求返回长度不匹配: 期望{expected_length}, 实际{actual_length}")
        return False

    return True
532
+
411
533
  def detect_file_type(self) -> Dict[str, Any]:
412
534
  """
413
535
  检测文件类型。
@@ -452,7 +574,7 @@ class Request:
452
574
 
453
575
  @property
454
576
  def to_dict(self) -> Dict[str, Any]:
455
- excluded_keys = {"request_settings"}
577
+ excluded_keys = {"request_settings", "url", "seed", "method", "check_status_code"}
456
578
  result = {
457
579
  key: value for key, value in self.__dict__.items()
458
580
  if not key.startswith('_') and key not in excluded_keys
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 3.1.29
3
+ Version: 3.1.31
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
5
5
 
6
6
  setup(
7
7
  name="cobweb-launcher",
8
- version="3.1.29",
8
+ version="3.1.31",
9
9
  packages=find_packages(),
10
10
  url="https://github.com/Juannie-PP/cobweb",
11
11
  license="MIT",