scrapy-item-ingest 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of scrapy-item-ingest might be problematic.

Files changed (30)
  1. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/PKG-INFO +2 -2
  2. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/README.md +1 -1
  3. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/__init__.py +1 -1
  4. scrapy_item_ingest-0.1.1/scrapy_item_ingest/extensions/logging.py +102 -0
  5. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest.egg-info/PKG-INFO +2 -2
  6. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/setup.py +2 -2
  7. scrapy_item_ingest-0.1.0/scrapy_item_ingest/extensions/logging.py +0 -45
  8. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/LICENSE +0 -0
  9. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/config/__init__.py +0 -0
  10. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/config/settings.py +0 -0
  11. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/database/__init__.py +0 -0
  12. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/database/connection.py +0 -0
  13. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/database/schema.py +0 -0
  14. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/extensions/__init__.py +0 -0
  15. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/extensions/base.py +0 -0
  16. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/pipelines/__init__.py +0 -0
  17. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/pipelines/base.py +0 -0
  18. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/pipelines/items.py +0 -0
  19. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/pipelines/main.py +0 -0
  20. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/pipelines/requests.py +0 -0
  21. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/utils/__init__.py +0 -0
  22. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/utils/fingerprint.py +0 -0
  23. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/utils/serialization.py +0 -0
  24. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest.egg-info/SOURCES.txt +0 -0
  25. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest.egg-info/dependency_links.txt +0 -0
  26. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest.egg-info/entry_points.txt +0 -0
  27. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest.egg-info/not-zip-safe +0 -0
  28. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest.egg-info/requires.txt +0 -0
  29. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest.egg-info/top_level.txt +0 -0
  30. {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/setup.cfg +0 -0
{scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapy_item_ingest
-Version: 0.1.0
+Version: 0.1.1
 Summary: Scrapy extension for database ingestion with job/spider tracking
 Home-page: https://github.com/fawadss1/scrapy_item_ingest
 Author: Fawad Ali

@@ -122,7 +122,7 @@ For support and questions:
 
 ## Changelog
 
-### v0.1.0 (Current)
+### v0.1.1 (Current)
 
 - Initial release
 - Core pipeline functionality for items, requests, and logs

{scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/README.md

@@ -59,7 +59,7 @@ For support and questions:
 
 ## Changelog
 
-### v0.1.0 (Current)
+### v0.1.1 (Current)
 
 - Initial release
 - Core pipeline functionality for items, requests, and logs

{scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest/__init__.py

@@ -12,7 +12,7 @@ Main Components:
 - RequestsPipeline: Standalone requests tracking pipeline
 """
 
-__version__ = "0.1.0"
+__version__ = "0.1.1"
 __author__ = "Fawad Ali"
 __description__ = "Scrapy extension for database ingestion with job/spider tracking"
 
scrapy_item_ingest-0.1.1/scrapy_item_ingest/extensions/logging.py

@@ -0,0 +1,102 @@
+"""
+Logging extension for tracking spider events.
+"""
+import logging
+import threading
+from scrapy import signals
+from .base import BaseExtension
+
+logger = logging.getLogger(__name__)
+
+
+class DatabaseLogHandler(logging.Handler):
+    """Custom logging handler to save all log records to the database in batches."""
+    _local = threading.local()
+    BATCH_SIZE = 100
+
+    def __init__(self, extension, spider):
+        super().__init__()
+        self.extension = extension
+        self.spider = spider
+        self._buffer = []
+
+    def emit(self, record):
+        if getattr(self._local, 'in_emit', False):
+            return  # Prevent recursion
+        self._local.in_emit = True
+        try:
+            # Format the log message
+            msg = self.format(record)
+            level = record.levelname
+            self._buffer.append((self.spider, level, msg))
+            if len(self._buffer) >= self.BATCH_SIZE:
+                self.flush()
+        except Exception:
+            # Avoid infinite recursion if logging fails
+            pass
+        finally:
+            self._local.in_emit = False
+
+    def flush(self):
+        if not self._buffer:
+            return
+        try:
+            for spider, level, msg in self._buffer:
+                self.extension._log_to_database(spider, level, msg)
+        except Exception:
+            pass
+        finally:
+            self._buffer.clear()
+
+
+class LoggingExtension(BaseExtension):
+    """Extension for logging spider events to database"""
+
+    def __init__(self, settings):
+        super().__init__(settings)
+        self._db_log_handler = None
+        self._spider = None
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create extension instance from crawler"""
+        ext = super().from_crawler(crawler)
+        # Connect to spider signals
+        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
+        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
+        crawler.signals.connect(ext.spider_error, signal=signals.spider_error)
+        crawler.signals.connect(ext.item_dropped, signal=signals.item_dropped)
+        return ext
+
+    def spider_opened(self, spider):
+        """Called when spider is opened"""
+        identifier_column, identifier_value = self.get_identifier_info(spider)
+        message = f"{identifier_column.title()} {identifier_value} started"
+        self._log_to_database(spider, "INFO", message)
+        # Attach custom DB log handler to root logger
+        self._spider = spider
+        self._db_log_handler = DatabaseLogHandler(self, spider)
+        self._db_log_handler.setLevel(logging.DEBUG)
+        logging.getLogger().addHandler(self._db_log_handler)
+
+    def spider_closed(self, spider, reason):
+        """Called when spider is closed"""
+        identifier_column, identifier_value = self.get_identifier_info(spider)
+        message = f"{identifier_column.title()} {identifier_value} closed with reason: {reason}"
+        self._log_to_database(spider, "INFO", message)
+        # Remove the DB log handler
+        if self._db_log_handler:
+            self._db_log_handler.flush()  # Flush any remaining logs
+            logging.getLogger().removeHandler(self._db_log_handler)
+            self._db_log_handler = None
+            self._spider = None
+
+    def spider_error(self, failure, response, spider):
+        """Called when spider encounters an error"""
+        message = f"Spider error: {str(failure.value)} on {response.url if response else 'unknown URL'}"
+        self._log_to_database(spider, "ERROR", message)
+
+    def item_dropped(self, item, response, spider, exception):
+        """Called when an item is dropped"""
+        message = f"Item dropped: {str(exception)} from {response.url if response else 'unknown URL'}"
+        self._log_to_database(spider, "INFO", message)

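Extensions like this are normally enabled through Scrapy's standard EXTENSIONS setting. A minimal sketch, not taken from the package's documentation: the import path follows from the file list above, while the priority value is arbitrary and any database-connection settings the package reads are not shown.

# settings.py of a Scrapy project (illustrative sketch only)
EXTENSIONS = {
    # Path inferred from scrapy_item_ingest/extensions/logging.py; 500 is an
    # arbitrary priority, and the package's own DB settings are omitted here.
    "scrapy_item_ingest.extensions.logging.LoggingExtension": 500,
}
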
{scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/scrapy_item_ingest.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapy_item_ingest
-Version: 0.1.0
+Version: 0.1.1
 Summary: Scrapy extension for database ingestion with job/spider tracking
 Home-page: https://github.com/fawadss1/scrapy_item_ingest
 Author: Fawad Ali

@@ -122,7 +122,7 @@ For support and questions:
 
 ## Changelog
 
-### v0.1.0 (Current)
+### v0.1.1 (Current)
 
 - Initial release
 - Core pipeline functionality for items, requests, and logs

{scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.1}/setup.py

@@ -2,14 +2,14 @@ from setuptools import setup, find_packages
 
 # Read the README file for long description
 try:
-    with open("README.md", "r", encoding="utf-8") as fh:
+    with open("README.md", encoding="utf-8") as fh:
         long_description = fh.read()
 except FileNotFoundError:
     long_description = "A comprehensive Scrapy extension for ingesting scraped items, requests, and logs into PostgreSQL databases."
 
 setup(
     name="scrapy_item_ingest",
-    version="0.1.0",
+    version="0.1.1",
     description="Scrapy extension for database ingestion with job/spider tracking",
     long_description=long_description,
     long_description_content_type="text/markdown",

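Aside from the version bump, the only change here is dropping the explicit "r" argument, which is cosmetic: read mode is already open()'s default, so both forms read the file identically. A quick check, assuming a README.md exists in the working directory:

# "r" is open()'s default mode, so both calls return the same text.
with open("README.md", "r", encoding="utf-8") as fh:
    explicit = fh.read()
with open("README.md", encoding="utf-8") as fh:
    implicit = fh.read()
assert explicit == implicit
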
@@ -1,45 +0,0 @@
1
- """
2
- Logging extension for tracking spider events.
3
- """
4
- import logging
5
- from scrapy import signals
6
- from .base import BaseExtension
7
-
8
- logger = logging.getLogger(__name__)
9
-
10
-
11
- class LoggingExtension(BaseExtension):
12
- """Extension for logging spider events to database"""
13
-
14
- @classmethod
15
- def from_crawler(cls, crawler):
16
- """Create extension instance from crawler"""
17
- ext = super().from_crawler(crawler)
18
- # Connect to spider signals
19
- crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
20
- crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
21
- crawler.signals.connect(ext.spider_error, signal=signals.spider_error)
22
- crawler.signals.connect(ext.item_dropped, signal=signals.item_dropped)
23
- return ext
24
-
25
- def spider_opened(self, spider):
26
- """Called when spider is opened"""
27
- identifier_column, identifier_value = self.get_identifier_info(spider)
28
- message = f"{identifier_column.title()} {identifier_value} started"
29
- self._log_to_database(spider, "SPIDER_OPENED", message)
30
-
31
- def spider_closed(self, spider, reason):
32
- """Called when spider is closed"""
33
- identifier_column, identifier_value = self.get_identifier_info(spider)
34
- message = f"{identifier_column.title()} {identifier_value} closed with reason: {reason}"
35
- self._log_to_database(spider, "SPIDER_CLOSED", message)
36
-
37
- def spider_error(self, failure, response, spider):
38
- """Called when spider encounters an error"""
39
- message = f"Spider error: {str(failure.value)} on {response.url if response else 'unknown URL'}"
40
- self._log_to_database(spider, "SPIDER_ERROR", message)
41
-
42
- def item_dropped(self, item, response, spider, exception):
43
- """Called when an item is dropped"""
44
- message = f"Item dropped: {str(exception)} from {response.url if response else 'unknown URL'}"
45
- self._log_to_database(spider, "ITEM_DROPPED", message)
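
The net effect of the rewrite is that 0.1.1 no longer writes one database row per signal with event-style levels (SPIDER_OPENED, ITEM_DROPPED, ...); it records standard levels (INFO, ERROR) and additionally captures all log output through a batching handler attached to the root logger. The sketch below isolates that batching pattern with a plain callable standing in for the database writer; it is an illustration, not the package's code.

import logging
import threading


class BatchingHandler(logging.Handler):
    """Illustrative stand-in for DatabaseLogHandler: buffers formatted records
    and hands them to a sink in batches instead of one write per record."""
    _local = threading.local()
    BATCH_SIZE = 3  # small value so the demo flushes quickly; 0.1.1 uses 100

    def __init__(self, sink):
        super().__init__()
        self.sink = sink      # stand-in for the extension's DB writer
        self._buffer = []

    def emit(self, record):
        if getattr(self._local, "in_emit", False):
            return            # guard against re-entrant logging
        self._local.in_emit = True
        try:
            self._buffer.append((record.levelname, self.format(record)))
            if len(self._buffer) >= self.BATCH_SIZE:
                self.flush()
        except Exception:
            pass              # never let logging failures propagate
        finally:
            self._local.in_emit = False

    def flush(self):
        if not self._buffer:
            return
        try:
            self.sink(list(self._buffer))   # one call per batch, not per record
        finally:
            self._buffer.clear()


batches = []
handler = BatchingHandler(batches.append)
log = logging.getLogger("batch-demo")
log.setLevel(logging.INFO)
log.addHandler(handler)
for i in range(7):
    log.info("message %d", i)
handler.flush()                    # mirrors the flush done in spider_closed
print([len(b) for b in batches])   # [3, 3, 1]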