scrapy-item-ingest 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

scrapy_item_ingest/__init__.py

@@ -12,7 +12,7 @@ Main Components:
 - RequestsPipeline: Standalone requests tracking pipeline
 """
 
-__version__ = "0.1.0"
+__version__ = "0.1.1"
 __author__ = "Fawad Ali"
 __description__ = "Scrapy extension for database ingestion with job/spider tracking"
 
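The only change to scrapy_item_ingest/__init__.py is the version bump. A minimal sketch for confirming which release is installed, using only the attribute shown in the hunk above:

    import scrapy_item_ingest

    # __version__ is set at module level in scrapy_item_ingest/__init__.py
    print(scrapy_item_ingest.__version__)  # prints "0.1.1" once the new wheel is installed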
scrapy_item_ingest/extensions/logging.py

@@ -2,15 +2,61 @@
 Logging extension for tracking spider events.
 """
 import logging
+import threading
 from scrapy import signals
 from .base import BaseExtension
 
 logger = logging.getLogger(__name__)
 
 
+class DatabaseLogHandler(logging.Handler):
+    """Custom logging handler to save all log records to the database in batches."""
+    _local = threading.local()
+    BATCH_SIZE = 100
+
+    def __init__(self, extension, spider):
+        super().__init__()
+        self.extension = extension
+        self.spider = spider
+        self._buffer = []
+
+    def emit(self, record):
+        if getattr(self._local, 'in_emit', False):
+            return  # Prevent recursion
+        self._local.in_emit = True
+        try:
+            # Format the log message
+            msg = self.format(record)
+            level = record.levelname
+            self._buffer.append((self.spider, level, msg))
+            if len(self._buffer) >= self.BATCH_SIZE:
+                self.flush()
+        except Exception:
+            # Avoid infinite recursion if logging fails
+            pass
+        finally:
+            self._local.in_emit = False
+
+    def flush(self):
+        if not self._buffer:
+            return
+        try:
+            for spider, level, msg in self._buffer:
+                self.extension._log_to_database(spider, level, msg)
+        except Exception:
+            pass
+        finally:
+            self._buffer.clear()
+
+
 class LoggingExtension(BaseExtension):
     """Extension for logging spider events to database"""
 
+    def __init__(self, settings):
+        super().__init__(settings)
+        self._db_log_handler = None
+        self._spider = None
+
     @classmethod
     def from_crawler(cls, crawler):
         """Create extension instance from crawler"""
@@ -26,20 +72,31 @@ class LoggingExtension(BaseExtension):
         """Called when spider is opened"""
         identifier_column, identifier_value = self.get_identifier_info(spider)
         message = f"{identifier_column.title()} {identifier_value} started"
-        self._log_to_database(spider, "SPIDER_OPENED", message)
+        self._log_to_database(spider, "INFO", message)
+        # Attach custom DB log handler to root logger
+        self._spider = spider
+        self._db_log_handler = DatabaseLogHandler(self, spider)
+        self._db_log_handler.setLevel(logging.DEBUG)
+        logging.getLogger().addHandler(self._db_log_handler)
 
     def spider_closed(self, spider, reason):
         """Called when spider is closed"""
         identifier_column, identifier_value = self.get_identifier_info(spider)
         message = f"{identifier_column.title()} {identifier_value} closed with reason: {reason}"
-        self._log_to_database(spider, "SPIDER_CLOSED", message)
+        self._log_to_database(spider, "INFO", message)
+        # Remove the DB log handler
+        if self._db_log_handler:
+            self._db_log_handler.flush()  # Flush any remaining logs
+            logging.getLogger().removeHandler(self._db_log_handler)
+            self._db_log_handler = None
+            self._spider = None
 
     def spider_error(self, failure, response, spider):
         """Called when spider encounters an error"""
         message = f"Spider error: {str(failure.value)} on {response.url if response else 'unknown URL'}"
-        self._log_to_database(spider, "SPIDER_ERROR", message)
+        self._log_to_database(spider, "ERROR", message)
 
     def item_dropped(self, item, response, spider, exception):
         """Called when an item is dropped"""
         message = f"Item dropped: {str(exception)} from {response.url if response else 'unknown URL'}"
-        self._log_to_database(spider, "ITEM_DROPPED", message)
+        self._log_to_database(spider, "INFO", message)
scrapy_item_ingest-0.1.0.dist-info/METADATA → scrapy_item_ingest-0.1.1.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapy_item_ingest
-Version: 0.1.0
+Version: 0.1.1
 Summary: Scrapy extension for database ingestion with job/spider tracking
 Home-page: https://github.com/fawadss1/scrapy_item_ingest
 Author: Fawad Ali
@@ -122,7 +122,7 @@ For support and questions:
 
 ## Changelog
 
-### v0.1.0 (Current)
+### v0.1.1 (Current)
 
 - Initial release
 - Core pipeline functionality for items, requests, and logs
scrapy_item_ingest-0.1.0.dist-info/RECORD → scrapy_item_ingest-0.1.1.dist-info/RECORD

@@ -1,4 +1,4 @@
-scrapy_item_ingest/__init__.py,sha256=2Cj2dnPpsGOTGH95ihMAdwASfT-Nhw3i4M5zfxLfgaw,1507
+scrapy_item_ingest/__init__.py,sha256=7kzpn5xDcnAf05mxtrGs0YJVzExkPuSZ9F2rpVKGUK0,1507
 scrapy_item_ingest/config/__init__.py,sha256=Foyt52_KDRIoDZtSH5ttcWxQXCOUgzebo4IGCPQwriY,55
 scrapy_item_ingest/config/settings.py,sha256=5GFQAqRf-6oc4KaMdMGuzLdtSP6fmn67anzGnKpejTI,2619
 scrapy_item_ingest/database/__init__.py,sha256=-D9cfI8Hrap74UkIUmcOZ-ikAZ8HKSswZAZMBtjq69A,50
@@ -6,7 +6,7 @@ scrapy_item_ingest/database/connection.py,sha256=bvSTCQfgBMcuKu-VzMCwMtSNBORzeea
 scrapy_item_ingest/database/schema.py,sha256=2HcBbW3VIWva59YCxyAinwZQDidFuyU5zuOCdCwBZUI,2866
 scrapy_item_ingest/extensions/__init__.py,sha256=G8xe0Bssf4jFvi3D1gNyOpylaDHlo-RKHEX9_tIB2f8,51
 scrapy_item_ingest/extensions/base.py,sha256=aodB_V47O8ihox2VdDizclAhQ6VonAUZ4lIOitub7kw,3192
-scrapy_item_ingest/extensions/logging.py,sha256=pI-MlQfNLHFmNuOSp60SfFAGRxJwPwurZp1xt88PeWU,2017
+scrapy_item_ingest/extensions/logging.py,sha256=gJGL5Ozl5kqivND1jSrw7ywilEt4Nz-rsYJiec4ZMKU,3948
 scrapy_item_ingest/pipelines/__init__.py,sha256=NvbUeLCwjFPvVaSTzdnN6LkToJ1ISAM91EmVero9FXo,50
 scrapy_item_ingest/pipelines/base.py,sha256=C6lk37lhNr6oADXofi4aqnN0cZ9u1KrWxEZMp6Oo7oA,1783
 scrapy_item_ingest/pipelines/items.py,sha256=FcgzmuFJ3qPVVQDZv9OjEPjlSbbW5MZykMmhAcdZ4Tc,1487
@@ -15,9 +15,9 @@ scrapy_item_ingest/pipelines/requests.py,sha256=3Wyzx6kgf7B_gg2DC0jhekCPSIEaJiRJ
 scrapy_item_ingest/utils/__init__.py,sha256=xuzfL-u3NkFElIrBygQISYv0CKMdSVvreuL16JXZMRM,49
 scrapy_item_ingest/utils/fingerprint.py,sha256=Qdby72nLNQp4-sxL51RM85MuxwFJHxmuYDbQv1c7hPc,855
 scrapy_item_ingest/utils/serialization.py,sha256=iKGhWnVwMKLKZ63kek4Hov9ESy9igA13CuOfDRD1W-M,942
-scrapy_item_ingest-0.1.0.dist-info/licenses/LICENSE,sha256=DhJQ4_j45c_DWghISLKmJshcLvX_Pr7QXaahe2iRMNo,1087
-scrapy_item_ingest-0.1.0.dist-info/METADATA,sha256=eiZF00MlFqNBdExA4kW2i4C_p_TvLsd4sLAsFE4uiD4,5868
-scrapy_item_ingest-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-scrapy_item_ingest-0.1.0.dist-info/entry_points.txt,sha256=WKFpo9Dy0qX1S1PT8NvIHqZmSxBCgyAM480LnLR8S1E,172
-scrapy_item_ingest-0.1.0.dist-info/top_level.txt,sha256=bu2ekFWcSH0ANdc8oGDdmZXaSC6kNuhtC-AggLsUQCU,19
-scrapy_item_ingest-0.1.0.dist-info/RECORD,,
+scrapy_item_ingest-0.1.1.dist-info/licenses/LICENSE,sha256=DhJQ4_j45c_DWghISLKmJshcLvX_Pr7QXaahe2iRMNo,1087
+scrapy_item_ingest-0.1.1.dist-info/METADATA,sha256=zwUBfROB8fYjYvgS-m36pSfexc4oE4FOe482jBr-cos,5868
+scrapy_item_ingest-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+scrapy_item_ingest-0.1.1.dist-info/entry_points.txt,sha256=WKFpo9Dy0qX1S1PT8NvIHqZmSxBCgyAM480LnLR8S1E,172
+scrapy_item_ingest-0.1.1.dist-info/top_level.txt,sha256=bu2ekFWcSH0ANdc8oGDdmZXaSC6kNuhtC-AggLsUQCU,19
+scrapy_item_ingest-0.1.1.dist-info/RECORD,,