scrapy-item-ingest 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of scrapy-item-ingest has been flagged as potentially problematic.
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/PKG-INFO +2 -2
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/README.md +1 -1
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/__init__.py +1 -1
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/config/settings.py +4 -6
- scrapy_item_ingest-0.1.2/scrapy_item_ingest/extensions/logging.py +102 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest.egg-info/PKG-INFO +2 -2
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/setup.py +2 -2
- scrapy_item_ingest-0.1.0/scrapy_item_ingest/extensions/logging.py +0 -45
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/LICENSE +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/config/__init__.py +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/database/__init__.py +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/database/connection.py +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/database/schema.py +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/extensions/__init__.py +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/extensions/base.py +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/pipelines/__init__.py +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/pipelines/base.py +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/pipelines/items.py +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/pipelines/main.py +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/pipelines/requests.py +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/utils/__init__.py +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/utils/fingerprint.py +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/utils/serialization.py +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest.egg-info/SOURCES.txt +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest.egg-info/dependency_links.txt +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest.egg-info/entry_points.txt +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest.egg-info/not-zip-safe +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest.egg-info/requires.txt +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest.egg-info/top_level.txt +0 -0
- {scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/setup.cfg +0 -0
{scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapy_item_ingest
-Version: 0.1.0
+Version: 0.1.2
 Summary: Scrapy extension for database ingestion with job/spider tracking
 Home-page: https://github.com/fawadss1/scrapy_item_ingest
 Author: Fawad Ali
@@ -122,7 +122,7 @@ For support and questions:
 
 ## Changelog
 
-### v0.1.0
+### v0.1.2 (Current)
 
 - Initial release
 - Core pipeline functionality for items, requests, and logs
{scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest/config/settings.py
@@ -54,12 +54,10 @@ class Settings:
         # Always return JOB_ID or fallback to None (spider name will be used)
         return self.crawler_settings.get('JOB_ID', None)
 
-
-
-
-
-        else:
-            return "job_id"  # Use job_id column when using existing tables
+    @staticmethod
+    def get_identifier_column():
+        """Get the identifier column name"""
+        return "job_id"
 
     def get_identifier_value(self, spider):
         """Get the identifier value with smart fallback"""
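The settings change replaces a conditional lookup with a constant column name: get_identifier_column() is now a staticmethod that always returns "job_id", while the per-run value keeps its fallback logic. Below is a minimal sketch of the resulting contract; the SettingsSketch and DemoSpider classes and the spider-name fallback in get_identifier_value are illustrative assumptions, not the package's actual implementation.

# Illustrative sketch only; assumes a dict-like crawler settings object and a
# hypothetical spider-name fallback in get_identifier_value.
class SettingsSketch:
    def __init__(self, crawler_settings):
        self.crawler_settings = crawler_settings

    @staticmethod
    def get_identifier_column():
        # After 0.1.2 the identifier column is always "job_id"
        return "job_id"

    def get_identifier_value(self, spider):
        # Assumed behaviour: use JOB_ID when configured, else the spider name
        return self.crawler_settings.get("JOB_ID") or spider.name


class DemoSpider:
    name = "quotes"


s = SettingsSketch({"JOB_ID": None})
print(SettingsSketch.get_identifier_column())  # job_id
print(s.get_identifier_value(DemoSpider()))    # quotes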
scrapy_item_ingest-0.1.2/scrapy_item_ingest/extensions/logging.py
@@ -0,0 +1,102 @@
+"""
+Logging extension for tracking spider events.
+"""
+import logging
+import threading
+from scrapy import signals
+from .base import BaseExtension
+
+logger = logging.getLogger(__name__)
+
+
+class DatabaseLogHandler(logging.Handler):
+    """Custom logging handler to save all log records to the database in batches."""
+    _local = threading.local()
+    BATCH_SIZE = 100
+
+    def __init__(self, extension, spider):
+        super().__init__()
+        self.extension = extension
+        self.spider = spider
+        self._buffer = []
+
+    def emit(self, record):
+        if getattr(self._local, 'in_emit', False):
+            return  # Prevent recursion
+        self._local.in_emit = True
+        try:
+            # Format the log message
+            msg = self.format(record)
+            level = record.levelname
+            self._buffer.append((self.spider, level, msg))
+            if len(self._buffer) >= self.BATCH_SIZE:
+                self.flush()
+        except Exception:
+            # Avoid infinite recursion if logging fails
+            pass
+        finally:
+            self._local.in_emit = False
+
+    def flush(self):
+        if not self._buffer:
+            return
+        try:
+            for spider, level, msg in self._buffer:
+                self.extension._log_to_database(spider, level, msg)
+        except Exception:
+            pass
+        finally:
+            self._buffer.clear()
+
+
+class LoggingExtension(BaseExtension):
+    """Extension for logging spider events to database"""
+
+    def __init__(self, settings):
+        super().__init__(settings)
+        self._db_log_handler = None
+        self._spider = None
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create extension instance from crawler"""
+        ext = super().from_crawler(crawler)
+        # Connect to spider signals
+        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
+        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
+        crawler.signals.connect(ext.spider_error, signal=signals.spider_error)
+        crawler.signals.connect(ext.item_dropped, signal=signals.item_dropped)
+        return ext
+
+    def spider_opened(self, spider):
+        """Called when spider is opened"""
+        identifier_column, identifier_value = self.get_identifier_info(spider)
+        message = f"{identifier_column.title()} {identifier_value} started"
+        self._log_to_database(spider, "INFO", message)
+        # Attach custom DB log handler to root logger
+        self._spider = spider
+        self._db_log_handler = DatabaseLogHandler(self, spider)
+        self._db_log_handler.setLevel(logging.DEBUG)
+        logging.getLogger().addHandler(self._db_log_handler)
+
+    def spider_closed(self, spider, reason):
+        """Called when spider is closed"""
+        identifier_column, identifier_value = self.get_identifier_info(spider)
+        message = f"{identifier_column.title()} {identifier_value} closed with reason: {reason}"
+        self._log_to_database(spider, "INFO", message)
+        # Remove the DB log handler
+        if self._db_log_handler:
+            self._db_log_handler.flush()  # Flush any remaining logs
+            logging.getLogger().removeHandler(self._db_log_handler)
+            self._db_log_handler = None
+        self._spider = None
+
+    def spider_error(self, failure, response, spider):
+        """Called when spider encounters an error"""
+        message = f"Spider error: {str(failure.value)} on {response.url if response else 'unknown URL'}"
+        self._log_to_database(spider, "ERROR", message)
+
+    def item_dropped(self, item, response, spider, exception):
+        """Called when an item is dropped"""
+        message = f"Item dropped: {str(exception)} from {response.url if response else 'unknown URL'}"
+        self._log_to_database(spider, "INFO", message)
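To show what the new handler does, here is a small standalone sketch. It assumes scrapy-item-ingest 0.1.2 is installed so DatabaseLogHandler is importable, and it substitutes a hypothetical StubExtension for the real database-backed extension, so the only behaviour exercised is the buffering: records accumulate until BATCH_SIZE (100) is reached or flush() is called explicitly.

import logging

from scrapy_item_ingest.extensions.logging import DatabaseLogHandler


class StubExtension:
    """Stand-in for the real extension; just records what would hit the database."""
    def __init__(self):
        self.rows = []

    def _log_to_database(self, spider, level, msg):
        self.rows.append((spider, level, msg))


ext = StubExtension()
handler = DatabaseLogHandler(ext, spider="quotes")  # the spider argument is stored as-is
handler.setFormatter(logging.Formatter("%(name)s: %(message)s"))

log = logging.getLogger("demo")
log.setLevel(logging.DEBUG)
log.addHandler(handler)

for i in range(5):
    log.info("event %d", i)

print(len(ext.rows))  # 0 -- still buffered, below BATCH_SIZE
handler.flush()
print(len(ext.rows))  # 5 -- drained to the stub "database" in one batch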
{scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/scrapy_item_ingest.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapy_item_ingest
-Version: 0.1.0
+Version: 0.1.2
 Summary: Scrapy extension for database ingestion with job/spider tracking
 Home-page: https://github.com/fawadss1/scrapy_item_ingest
 Author: Fawad Ali
@@ -122,7 +122,7 @@ For support and questions:
 
 ## Changelog
 
-### v0.1.0
+### v0.1.2 (Current)
 
 - Initial release
 - Core pipeline functionality for items, requests, and logs
{scrapy_item_ingest-0.1.0 → scrapy_item_ingest-0.1.2}/setup.py
@@ -2,14 +2,14 @@ from setuptools import setup, find_packages
 
 # Read the README file for long description
 try:
-    with open("README.md",
+    with open("README.md", encoding="utf-8") as fh:
         long_description = fh.read()
 except FileNotFoundError:
     long_description = "A comprehensive Scrapy extension for ingesting scraped items, requests, and logs into PostgreSQL databases."
 
 setup(
     name="scrapy_item_ingest",
-    version="0.1.0",
+    version="0.1.2",
     description="Scrapy extension for database ingestion with job/spider tracking",
     long_description=long_description,
     long_description_content_type="text/markdown",
scrapy_item_ingest-0.1.0/scrapy_item_ingest/extensions/logging.py
@@ -1,45 +0,0 @@
-"""
-Logging extension for tracking spider events.
-"""
-import logging
-from scrapy import signals
-from .base import BaseExtension
-
-logger = logging.getLogger(__name__)
-
-
-class LoggingExtension(BaseExtension):
-    """Extension for logging spider events to database"""
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        """Create extension instance from crawler"""
-        ext = super().from_crawler(crawler)
-        # Connect to spider signals
-        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
-        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
-        crawler.signals.connect(ext.spider_error, signal=signals.spider_error)
-        crawler.signals.connect(ext.item_dropped, signal=signals.item_dropped)
-        return ext
-
-    def spider_opened(self, spider):
-        """Called when spider is opened"""
-        identifier_column, identifier_value = self.get_identifier_info(spider)
-        message = f"{identifier_column.title()} {identifier_value} started"
-        self._log_to_database(spider, "SPIDER_OPENED", message)
-
-    def spider_closed(self, spider, reason):
-        """Called when spider is closed"""
-        identifier_column, identifier_value = self.get_identifier_info(spider)
-        message = f"{identifier_column.title()} {identifier_value} closed with reason: {reason}"
-        self._log_to_database(spider, "SPIDER_CLOSED", message)
-
-    def spider_error(self, failure, response, spider):
-        """Called when spider encounters an error"""
-        message = f"Spider error: {str(failure.value)} on {response.url if response else 'unknown URL'}"
-        self._log_to_database(spider, "SPIDER_ERROR", message)
-
-    def item_dropped(self, item, response, spider, exception):
-        """Called when an item is dropped"""
-        message = f"Item dropped: {str(exception)} from {response.url if response else 'unknown URL'}"
-        self._log_to_database(spider, "ITEM_DROPPED", message)
All remaining files listed above are unchanged between 0.1.0 and 0.1.2; they are renamed only by the versioned directory prefix.