scrapy_item_ingest-0.2.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapy_item_ingest/__init__.py +48 -0
- scrapy_item_ingest/config/__init__.py +2 -0
- scrapy_item_ingest/config/settings.py +134 -0
- scrapy_item_ingest/database/__init__.py +2 -0
- scrapy_item_ingest/database/connection.py +150 -0
- scrapy_item_ingest/database/schema.py +79 -0
- scrapy_item_ingest/extensions/__init__.py +2 -0
- scrapy_item_ingest/extensions/base.py +81 -0
- scrapy_item_ingest/extensions/logging.py +129 -0
- scrapy_item_ingest/pipelines/__init__.py +2 -0
- scrapy_item_ingest/pipelines/base.py +48 -0
- scrapy_item_ingest/pipelines/items.py +38 -0
- scrapy_item_ingest/pipelines/main.py +42 -0
- scrapy_item_ingest/pipelines/requests.py +138 -0
- scrapy_item_ingest/utils/__init__.py +2 -0
- scrapy_item_ingest/utils/fingerprint.py +21 -0
- scrapy_item_ingest/utils/serialization.py +9 -0
- scrapy_item_ingest/utils/time.py +19 -0
- scrapy_item_ingest-0.2.4.dist-info/METADATA +131 -0
- scrapy_item_ingest-0.2.4.dist-info/RECORD +24 -0
- scrapy_item_ingest-0.2.4.dist-info/WHEEL +5 -0
- scrapy_item_ingest-0.2.4.dist-info/entry_points.txt +5 -0
- scrapy_item_ingest-0.2.4.dist-info/licenses/LICENSE +21 -0
- scrapy_item_ingest-0.2.4.dist-info/top_level.txt +1 -0
scrapy_item_ingest/__init__.py
@@ -0,0 +1,48 @@
"""
scrapy_item_ingest - A Scrapy extension for ingesting items and requests into databases.

This package provides pipelines and extensions for storing scraped data, tracking requests,
and logging spider events to PostgreSQL databases with support for both spider-based and
job-based identification.

Main Components:
- DbInsertPipeline: Combined pipeline for items and requests
- LoggingExtension: Extension for logging spider events
- ItemsPipeline: Standalone items processing pipeline
- RequestsPipeline: Standalone requests tracking pipeline
"""

__version__ = "0.2.4"
__author__ = "Fawad Ali"
__description__ = "Scrapy extension for database ingestion with job/spider tracking"

# Import main classes directly from organized modules
from .pipelines.main import DbInsertPipeline
from .extensions.logging import LoggingExtension

# Import individual components for advanced users
from .pipelines.items import ItemsPipeline
from .pipelines.requests import RequestsPipeline

# Import configuration utilities
from .config.settings import Settings, validate_settings

# Define what gets imported with "from scrapy_item_ingest import *"
__all__ = [
    # Main classes (most commonly used)
    'DbInsertPipeline',
    'LoggingExtension',

    # Individual components
    'ItemsPipeline',
    'RequestsPipeline',

    # Configuration
    'Settings',
    'validate_settings',

    # Package metadata
    '__version__',
    '__author__',
    '__description__',
]

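The two classes exported at the top of `__all__` are the ones most projects need. For orientation, a minimal Scrapy `settings.py` wiring them up might look like the sketch below (the same configuration appears in the package README further down; the values are examples only):

```python
# settings.py of your Scrapy project (minimal wiring; values are examples)
ITEM_PIPELINES = {
    "scrapy_item_ingest.DbInsertPipeline": 300,   # items + request tracking
}

EXTENSIONS = {
    "scrapy_item_ingest.LoggingExtension": 500,   # spider logs -> job_logs table
}

DB_URL = "postgresql://user:password@localhost:5432/database"
CREATE_TABLES = True   # let the package create job_items/job_requests/job_logs
JOB_ID = "run-42"      # optional; falls back to the spider name when unset
```
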
scrapy_item_ingest/config/settings.py
@@ -0,0 +1,134 @@
"""
Module for managing and validating crawler settings.

This module provides utility classes and functions for handling the settings of
a crawler, including the database configuration, operational parameters, and
customizable options. The primary class `Settings` provides an interface for
accessing settings dynamically and offers default fallbacks where values are
not explicitly defined. The utility function `validate_settings` ensures critical
configuration is present.
"""


class Settings:
    """
    Handles settings configuration for crawlers, providing access to default values,
    database table names, and other operational parameters defined in crawler settings.

    This class facilitates the standardized management and retrieval of settings that
    are essential for database operations and crawler configuration. Its purpose
    is to provide default fallbacks and dynamically adapt to user-specified settings.
    """

    DEFAULT_ITEMS_TABLE = 'job_items'
    DEFAULT_REQUESTS_TABLE = 'job_requests'
    DEFAULT_LOGS_TABLE = 'job_logs'
    DEFAULT_TIMEZONE = "Asia/Karachi"

    def __init__(self, crawler_settings):
        self.crawler_settings = crawler_settings

    @property
    def db_url(self):
        """
        Provides access to the database URL from the crawler settings.

        This property retrieves the database URL defined in the crawler's
        settings. It is helpful when the database configuration needs to be
        accessed dynamically.

        :return: The database URL as defined in the crawler's configuration.
        :rtype: str or None
        """
        return self.crawler_settings.get('DB_URL')

    @property
    def db_type(self):
        """
        Retrieves the database type from the crawler settings.

        This property fetches the value assigned to the key `DB_TYPE` within
        the `crawler_settings`. Defaults to 'postgres' if the key is not set.

        :return: The database type as a string.
        :rtype: str
        """
        return self.crawler_settings.get('DB_TYPE', 'postgres')

    @property
    def db_items_table(self):
        """Return the table name for items."""
        return self.crawler_settings.get('ITEMS_TABLE', self.DEFAULT_ITEMS_TABLE)

    @property
    def db_requests_table(self):
        """
        Fetches the name of the database table used to store request
        information. It retrieves the value from crawler settings if defined;
        otherwise, it defaults to the value of `DEFAULT_REQUESTS_TABLE`.

        :return: Name of the database table for storing requests.
        :rtype: str
        """
        return self.crawler_settings.get('REQUESTS_TABLE', self.DEFAULT_REQUESTS_TABLE)

    @property
    def db_logs_table(self):
        """
        Retrieve the name of the database logs table.

        This property fetches the value of the database logs table name
        provided in the crawler settings. If the value is not explicitly
        defined in the settings, it falls back to the default logs table.

        :return: The name of the database logs table.
        :rtype: str
        """
        return self.crawler_settings.get('LOGS_TABLE', self.DEFAULT_LOGS_TABLE)

    @property
    def create_tables(self):
        """
        Retrieve the setting for creating database tables from crawler settings.

        This property fetches the value of the 'CREATE_TABLES' option from the crawler
        settings. If the option is not specified in the settings, it defaults to True.

        :return: Boolean value indicating whether to create tables.
        :rtype: bool
        """
        return self.crawler_settings.getbool('CREATE_TABLES', True)

    def get_tz(self):
        """
        Return the timezone string for the project.

        This checks for a 'TIMEZONE' setting in the crawler settings and falls back
        to the default ('Asia/Karachi').

        Returns:
            str: The timezone string (e.g., 'Asia/Karachi').
        """
        return self.crawler_settings.get('TIMEZONE', self.DEFAULT_TIMEZONE)

    @staticmethod
    def get_identifier_column():
        """Get the identifier column name"""
        return "job_id"

    def get_identifier_value(self, spider):
        """Get the identifier value with smart fallback"""
        job_id = self.crawler_settings.get('JOB_ID', None)
        # Use JOB_ID if provided, else fall back to the spider name
        # (the same rule applies whether or not tables are created).
        return job_id if job_id else spider.name


def validate_settings(settings):
    """Validate configuration settings"""
    if not settings.db_url:
        raise ValueError("DB_URL must be set in settings")

    # Job ID is optional - the spider name is used as a fallback
    return True

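To see the fallback behaviour in isolation, here is a small sketch (not part of the package) that feeds a plain `scrapy.settings.Settings` object into the wrapper above; only `ITEMS_TABLE` is overridden, so the other accessors return their defaults:

```python
from scrapy.settings import Settings as ScrapySettings

from scrapy_item_ingest.config.settings import Settings, validate_settings

crawler_settings = ScrapySettings({
    "DB_URL": "postgresql://user:pass@localhost:5432/scrapydb",
    "ITEMS_TABLE": "my_items",          # explicit override
})

cfg = Settings(crawler_settings)
validate_settings(cfg)                  # passes because DB_URL is set

print(cfg.db_items_table)     # my_items      (from the crawler settings)
print(cfg.db_requests_table)  # job_requests  (DEFAULT_REQUESTS_TABLE)
print(cfg.create_tables)      # True          (CREATE_TABLES default)
print(cfg.get_tz())           # Asia/Karachi  (DEFAULT_TIMEZONE)
```
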
scrapy_item_ingest/database/connection.py
@@ -0,0 +1,150 @@
# scrapy_item_ingest/database/connection.py

import logging
from typing import Optional, Any, Sequence
from urllib.parse import urlsplit, urlunsplit, quote, unquote

import psycopg2
from psycopg2 import OperationalError


class DBConnection:
    """
    PostgreSQL connection manager (singleton) with a small convenience API used by
    pipelines and schema utilities. Supports either a DSN/URL or settings-based
    configuration and exposes `connect/execute/commit/rollback/close` methods.
    """

    _instance = None  # Singleton instance
    _connection = None
    _db_url: Optional[str] = None
    _logger = logging.getLogger(__name__)

    def __new__(cls, db_url: Optional[str] = None):
        # Ensure only one instance exists (singleton) and accept an optional db_url
        if cls._instance is None:
            cls._instance = super(DBConnection, cls).__new__(cls)
            if db_url:
                cls._instance._db_url = db_url
            cls._instance._initialize_connection()
        else:
            # If a URL is passed later and we don't have one stored yet, keep it
            if db_url and cls._instance._db_url is None:
                cls._instance._db_url = db_url
            # Do not auto-reconnect here; the next use will reconnect if needed
        return cls._instance

    def _normalize_dsn(self, dsn: str) -> str:
        """Normalize a PostgreSQL DSN/URL by URL-encoding credentials if needed.

        Handles passwords that mistakenly include raw '@' or '$' by treating the last
        '@' as the boundary between credentials and host.
        """
        try:
            if "://" not in dsn:
                return dsn
            scheme, rest = dsn.split("://", 1)
            # Separate netloc and remaining path/query/fragment
            if "/" in rest:
                netloc, tail = rest.split("/", 1)
                tail = "/" + tail
            else:
                netloc, tail = rest, ""
            if "@" in netloc:
                userinfo, hostport = netloc.rsplit("@", 1)
                if ":" in userinfo:
                    user, pwd = userinfo.split(":", 1)
                    # Encode only if the password contains reserved characters
                    if any(c in pwd for c in "@:$ /\\"):
                        user_enc = quote(unquote(user), safe="")
                        pwd_enc = quote(pwd, safe="")
                        netloc = f"{user_enc}:{pwd_enc}@{hostport}"
            return f"{scheme}://{netloc}{tail}"
        except Exception:
            return dsn

    def _initialize_connection(self):
        """Initialize the PostgreSQL connection once (or reconnect if closed)."""
        if self._connection is not None and getattr(self._connection, "closed", 0) == 0:
            return

        source = "unknown"
        try:
            if self._db_url:
                source = "db_url"
                dsn = self._normalize_dsn(self._db_url)
                self._connection = psycopg2.connect(dsn)
            else:
                # Lazy import to avoid module-level dependency on Scrapy
                from scrapy.utils.project import get_project_settings
                settings = get_project_settings()
                source = "Scrapy settings"
                self._connection = psycopg2.connect(
                    host=settings.get("DB_HOST"),
                    port=settings.get("DB_PORT"),
                    user=settings.get("DB_USER"),
                    password=settings.get("DB_PASSWORD"),
                    dbname=settings.get("DB_NAME"),
                )
            self._connection.autocommit = False  # manual commit per item
        except OperationalError as e:
            # Mask password in logs by not printing full URL; provide hint
            self._logger.error(
                "Failed to connect to database via %s: %s. "
                "Verify DB settings or DSN (host, port, user, dbname).",
                source,
                str(e),
            )
            raise

    # Public API expected by pipelines/schema
    def connect(self) -> bool:
        try:
            self._initialize_connection()
            return True
        except Exception:
            return False

    def cursor(self):
        if self._connection is None or getattr(self._connection, "closed", 1):
            self._initialize_connection()
        return self._connection.cursor()

    def execute(self, sql: str, params: Optional[Sequence[Any]] = None):
        """Execute a SQL statement.

        Returns the first row (tuple) if the statement produces a result set
        (e.g., SELECT or INSERT ... RETURNING), otherwise returns None.
        """
        with self.cursor() as cur:
            if params is not None:
                cur.execute(sql, params)
            else:
                cur.execute(sql)
            # If the statement returns rows, fetch one for callers expecting a value
            if cur.description is not None:
                row = cur.fetchone()
                return row
            return None

    def commit(self):
        if self._connection:
            self._connection.commit()

    def rollback(self):
        if self._connection:
            self._connection.rollback()

    def get_connection(self):
        """Return the active connection (always the same one)."""
        if self._connection is None or getattr(self._connection, "closed", 1):
            self._initialize_connection()
        return self._connection

    def close(self):
        """Close connection gracefully when the spider ends."""
        if self._connection and not self._connection.closed:
            self._connection.close()


# Backwards compatibility: older code imports `DatabaseConnection`.
# Export an alias so both names work.
DatabaseConnection = DBConnection

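The `_normalize_dsn` step above exists so that DSNs whose password was pasted in with raw reserved characters still parse. A standalone sketch of the same idea (it does not call the class, and the password is a made-up illustrative value):

```python
from urllib.parse import quote, unquote

def normalize_dsn(dsn: str) -> str:
    """Percent-encode the credentials of a postgresql:// URL when they contain
    reserved characters, treating the *last* '@' as the credentials/host split."""
    if "://" not in dsn:
        return dsn
    scheme, rest = dsn.split("://", 1)
    netloc, _, path = rest.partition("/")
    tail = "/" + path if path else ""
    if "@" in netloc:
        userinfo, hostport = netloc.rsplit("@", 1)
        if ":" in userinfo:
            user, pwd = userinfo.split(":", 1)
            if any(c in pwd for c in "@:$ /\\"):
                netloc = f"{quote(unquote(user), safe='')}:{quote(pwd, safe='')}@{hostport}"
    return f"{scheme}://{netloc}{tail}"

# A password containing raw '@' and '$' (illustrative value only)
print(normalize_dsn("postgresql://user:PAK@swat1$@localhost:5432/db"))
# -> postgresql://user:PAK%40swat1%24@localhost:5432/db
```
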
scrapy_item_ingest/database/schema.py
@@ -0,0 +1,79 @@
"""
Database schema management utilities for scrapy_item_ingest.
"""
import logging

logger = logging.getLogger(__name__)


class SchemaManager:
    """Database schema management"""

    def __init__(self, db_connection, settings):
        self.db = db_connection
        self.settings = settings

    def create_items_table(self):
        """Create items table if it doesn't exist"""
        items_table_sql = f"""
            CREATE TABLE IF NOT EXISTS {self.settings.db_items_table} (
                id SERIAL PRIMARY KEY,
                job_id VARCHAR(255),
                item JSONB,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """
        self.db.execute(items_table_sql)
        logger.info(f"Items table {self.settings.db_items_table} created/verified with job_id column")

    def create_requests_table(self):
        """Create requests table if it doesn't exist"""
        requests_table_sql = f"""
            CREATE TABLE IF NOT EXISTS {self.settings.db_requests_table} (
                id SERIAL PRIMARY KEY,
                job_id VARCHAR(255),
                url TEXT,
                method VARCHAR(10),
                status_code INTEGER,
                duration FLOAT,
                response_time FLOAT,
                fingerprint VARCHAR(64),
                parent_id INTEGER,
                parent_url TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (parent_id) REFERENCES {self.settings.db_requests_table}(id)
            )
        """
        self.db.execute(requests_table_sql)
        logger.info(f"Requests table {self.settings.db_requests_table} created/verified with job_id column")

    def create_logs_table(self):
        """Create logs table if it doesn't exist"""
        logs_table_sql = f"""
            CREATE TABLE IF NOT EXISTS {self.settings.db_logs_table} (
                id SERIAL PRIMARY KEY,
                job_id VARCHAR(255),
                level VARCHAR(50),
                message TEXT,
                timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """
        self.db.execute(logs_table_sql)
        logger.info(f"Logs table {self.settings.db_logs_table} created/verified with job_id column")

    def ensure_tables_exist(self):
        """Create all tables if they don't exist (only if create_tables is True)"""
        if not self.settings.create_tables:
            logger.info("Table creation disabled. Skipping table creation.")
            return

        try:
            self.create_items_table()
            self.create_requests_table()
            self.create_logs_table()
            self.db.commit()
            logger.info("All tables created/verified successfully")
        except Exception as e:
            logger.error(f"Failed to create tables: {e}")
            self.db.rollback()
            raise

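Because scraped items land in a single JSONB column, they can be queried back directly in PostgreSQL. A minimal sketch using psycopg2, assuming the default `job_items` table from the DDL above; the connection string and the `title` field inside the stored item are hypothetical:

```python
import psycopg2

# Placeholder connection parameters; point them at the database the pipeline writes to.
conn = psycopg2.connect("postgresql://user:password@localhost:5432/database")

with conn.cursor() as cur:
    # ->> extracts a JSONB field as text; 'title' is a hypothetical item field.
    cur.execute(
        "SELECT id, item->>'title', created_at "
        "FROM job_items WHERE job_id = %s ORDER BY id",
        ("my_spider",),   # job_id defaults to the spider name when JOB_ID is unset
    )
    for row_id, title, created_at in cur.fetchall():
        print(row_id, title, created_at)

conn.close()
```
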
scrapy_item_ingest/extensions/base.py
@@ -0,0 +1,81 @@
"""
Base extension functionality for scrapy_item_ingest.
"""
import logging

from scrapy_item_ingest.config.settings import Settings, validate_settings
from ..utils.time import get_current_datetime
from ..database.connection import DatabaseConnection
from ..database.schema import SchemaManager

logger = logging.getLogger(__name__)


class BaseExtension:
    """Base extension with common functionality"""

    def __init__(self, settings):
        self.settings = settings
        validate_settings(settings)
        # Lazy-initialized shared DB connection and schema manager
        self._db = None
        self._schema_manager = None
        # Prevent repeated error spam if DB logging fails
        self._db_logging_enabled = True

    @classmethod
    def from_crawler(cls, crawler):
        """Create extension instance from crawler"""
        settings = Settings(crawler.settings)
        return cls(settings)

    def get_identifier_info(self, spider):
        """Get identifier column and value for the spider"""
        return self.settings.get_identifier_column(), self.settings.get_identifier_value(spider)

    def _ensure_db_initialized(self):
        """Initialize DB connection and schema manager lazily."""
        if self._db is None:
            self._db = DatabaseConnection(self.settings.db_url)
            if not self._db.connect():
                raise RuntimeError("Failed to connect to database for logging")
        if self._schema_manager is None:
            self._schema_manager = SchemaManager(self._db, self.settings)

    def _ensure_logs_table_exists(self):
        """Create logs table if it doesn't exist (only if create_tables is True)."""
        if not self.settings.create_tables:
            return
        try:
            self._schema_manager.create_logs_table()
            self._db.commit()
        except Exception:
            self._db.rollback()

    def _log_to_database(self, spider, log_level, message):
        """Helper method to log messages to database using shared DBConnection."""
        if not self._db_logging_enabled:
            return
        try:
            self._ensure_db_initialized()
            self._ensure_logs_table_exists()

            identifier_column, identifier_value = self.get_identifier_info(spider)
            sql = f"""
                INSERT INTO {self.settings.db_logs_table}
                ({identifier_column}, level, message, timestamp)
                VALUES (%s, %s, %s, %s)
            """
            self._db.execute(
                sql,
                (
                    identifier_value,
                    log_level,
                    message,
                    get_current_datetime(self.settings),
                ),
            )
            self._db.commit()
        except Exception:
            # Disable further DB logging to avoid spamming errors
            self._db_logging_enabled = False

scrapy_item_ingest/extensions/logging.py
@@ -0,0 +1,129 @@
"""
Logging extension for capturing spider errors and logs and saving them to the database.
"""
from __future__ import annotations

import logging
import threading
from typing import List

from scrapy import signals
from scrapy.spiders import Spider
from scrapy.crawler import Crawler

from .base import BaseExtension

logger = logging.getLogger(__name__)


class ScrapyAndRootFilter(logging.Filter):
    """
    A logging filter that allows records from the 'root' logger and any logger
    within the 'scrapy' namespace.
    """
    def filter(self, record: logging.LogRecord) -> bool:
        # Allow logs from the spider itself (which might not be in 'scrapy' namespace)
        if hasattr(record, 'spider_name') and record.name == getattr(record, 'spider_name', None):
            return True
        return record.name == 'root' or record.name.startswith('scrapy')


class DatabaseLogHandler(logging.Handler):
    """
    Custom logging handler to save log records to the database in real-time.
    """
    _local = threading.local()

    def __init__(self, extension: 'LoggingExtension', spider: Spider):
        super().__init__()
        self.extension = extension
        self.spider = spider

    def emit(self, record: logging.LogRecord):
        if getattr(self._local, 'in_emit', False):
            return  # Prevent recursion

        # Avoid capturing logs generated by this extension's own exceptions
        if 'extensions/logging.py' in record.pathname:
            return

        self._local.in_emit = True
        try:
            # Add spider name to record for the filter
            record.spider_name = self.spider.name
            msg = self.format(record)
            level = record.levelname
            # Log directly to the database in real-time
            self.extension._log_to_database(self.spider, level, msg)
        except Exception:
            # Use logger directly to avoid recursion if formatting fails
            logger.exception("Error in DatabaseLogHandler.emit")
        finally:
            self._local.in_emit = False


class LoggingExtension(BaseExtension):
    """
    Extension for logging spider events to the database.
    """

    def __init__(self, settings):
        super().__init__(settings)
        crawler_settings = self.settings.crawler_settings
        self.log_level = crawler_settings.get('LOG_LEVEL', 'INFO').upper()
        self.log_format = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
        self.log_dateformat = '%Y-%m-%d %H:%M:%S'

        self._db_log_handler: DatabaseLogHandler | None = None
        self._root_logger_ref: logging.Logger | None = None

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Create an extension instance from crawler."""
        ext = super().from_crawler(crawler)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.engine_stopped, signal=signals.engine_stopped)
        return ext

    def spider_opened(self, spider: Spider):
        """Called when a spider is opened."""
        handler = DatabaseLogHandler(self, spider)
        level = getattr(logging, self.log_level, logging.INFO)
        handler.setLevel(level)
        formatter = logging.Formatter(fmt=self.log_format, datefmt=self.log_dateformat)
        handler.setFormatter(formatter)

        handler.addFilter(ScrapyAndRootFilter())

        self._db_log_handler = handler

        root_logger = logging.getLogger()

        if not any(isinstance(h, DatabaseLogHandler) for h in root_logger.handlers):
            root_logger.addHandler(handler)
            self._root_logger_ref = root_logger

        identifier_column, identifier_value = self.get_identifier_info(spider)
        message = f"{identifier_column.title()} {identifier_value} started"
        spider.logger.info(message)

    def spider_closed(self, spider: Spider, reason: str):
        """Called when a spider is closed."""
        identifier_column, identifier_value = self.get_identifier_info(spider)
        message = f"{identifier_column.title()} {identifier_value} closed with reason: {reason}"
        spider.logger.info(message)
        self._cleanup()

    def engine_stopped(self):
        """Called when the Scrapy engine stops."""
        self._cleanup()

    def _cleanup(self):
        """Removes the log handler."""
        if self._db_log_handler and self._root_logger_ref:
            self._root_logger_ref.removeHandler(self._db_log_handler)
            self._db_log_handler.close()

        self._db_log_handler = None
        self._root_logger_ref = None

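The extension is essentially the standard `logging.Handler` + `logging.Filter` pattern with the database as the sink. A stripped-down, database-free sketch of the same mechanism (illustrative only, printing instead of inserting rows):

```python
import logging

class NamespaceFilter(logging.Filter):
    """Pass only records from the root logger or the 'scrapy' namespace."""
    def filter(self, record: logging.LogRecord) -> bool:
        return record.name == "root" or record.name.startswith("scrapy")

class PrintHandler(logging.Handler):
    """Stand-in for DatabaseLogHandler: print instead of INSERTing a row."""
    def emit(self, record: logging.LogRecord) -> None:
        print(f"[{record.levelname}] {record.name}: {self.format(record)}")

handler = PrintHandler()
handler.setLevel(logging.INFO)
handler.setFormatter(logging.Formatter("%(asctime)s %(message)s"))
handler.addFilter(NamespaceFilter())

root = logging.getLogger()
root.addHandler(handler)        # same attach point the extension uses
root.setLevel(logging.INFO)

logging.getLogger("scrapy.core.engine").info("captured")     # passes the filter
logging.getLogger("urllib3").info("dropped by the filter")   # filtered out
```
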
scrapy_item_ingest/pipelines/base.py
@@ -0,0 +1,48 @@
"""
Base pipeline functionality for scrapy_item_ingest.
"""
import logging

from ..config.settings import Settings, validate_settings
from ..database.connection import DatabaseConnection
from ..database.schema import SchemaManager

logger = logging.getLogger(__name__)


class BasePipeline:
    """Base pipeline with common functionality"""

    def __init__(self, settings):
        self.settings = settings
        self.db = None
        self.schema_manager = None
        validate_settings(settings)

    @classmethod
    def from_crawler(cls, crawler):
        """Create pipeline instance from crawler"""
        settings = Settings(crawler.settings)
        return cls(settings)

    def open_spider(self, spider):
        """Called when spider is opened"""
        # Establish database connection
        self.db = DatabaseConnection(self.settings.db_url)
        if not self.db.connect():
            raise Exception("Failed to connect to database")

        # Initialize schema manager
        self.schema_manager = SchemaManager(self.db, self.settings)

        # Ensure tables exist
        self.schema_manager.ensure_tables_exist()

    def close_spider(self, spider):
        """Called when spider is closed"""
        if self.db:
            self.db.close()

    def get_identifier_info(self, spider):
        """Get identifier column and value for the spider"""
        return self.settings.get_identifier_column(), self.settings.get_identifier_value(spider)

scrapy_item_ingest/pipelines/items.py
@@ -0,0 +1,38 @@
"""
Items pipeline for storing scraped items.
"""
import logging

from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

from .base import BasePipeline
from ..utils.serialization import serialize_item_data
from ..utils.time import get_current_datetime

logger = logging.getLogger(__name__)


class ItemsPipeline(BasePipeline):
    """Pipeline for handling scraped items"""

    def process_item(self, item, spider):
        """Process and store item in database"""
        job_id = self.settings.get_identifier_value(spider)

        adapter = ItemAdapter(item)
        item_dict = adapter.asdict()
        created_at = get_current_datetime(self.settings)

        # Store everything as JSON in the item column
        try:
            sql = f"INSERT INTO {self.settings.db_items_table} (job_id, item, created_at) VALUES (%s, %s, %s)"
            json_data = serialize_item_data(item_dict)

            self.db.execute(sql, (job_id, json_data, created_at))
            self.db.commit()
        except Exception as e:
            self.db.rollback()
            raise DropItem(f"DB insert error: {e}")

        return item

scrapy_item_ingest/pipelines/main.py
@@ -0,0 +1,42 @@
"""
Main pipeline that combines items and requests functionality.
"""
import logging

from .items import ItemsPipeline
from .requests import RequestsPipeline

logger = logging.getLogger(__name__)


class DbInsertPipeline(ItemsPipeline, RequestsPipeline):
    """
    Main pipeline that combines item processing and request tracking.
    Inherits from both ItemsPipeline and RequestsPipeline.
    """

    def __init__(self, settings):
        # Initialize both parent classes
        ItemsPipeline.__init__(self, settings)
        RequestsPipeline.__init__(self, settings)

    @classmethod
    def from_crawler(cls, crawler):
        """Create pipeline instance from crawler"""
        # Use RequestsPipeline's from_crawler to get signal connections
        return RequestsPipeline.from_crawler.__func__(cls, crawler)

    def open_spider(self, spider):
        """Called when spider is opened"""
        # Use the base class implementation
        super().open_spider(spider)

    def close_spider(self, spider):
        """Called when spider is closed"""
        # Use the base class implementation
        super().close_spider(spider)

    def process_item(self, item, spider):
        """Process and store item in database"""
        # Use ItemsPipeline's process_item method
        return ItemsPipeline.process_item(self, item, spider)

scrapy_item_ingest/pipelines/requests.py
@@ -0,0 +1,138 @@
"""
Requests pipeline for tracking request information.
"""
import logging

from scrapy import signals

from .base import BasePipeline
from ..utils.fingerprint import get_request_fingerprint
from ..utils.time import get_current_datetime

logger = logging.getLogger(__name__)


class RequestsPipeline(BasePipeline):
    """Pipeline for handling request tracking"""

    def __init__(self, settings):
        super().__init__(settings)
        self.request_id_map = {}  # Track fingerprint to database ID mapping
        self.url_to_id_map = {}  # Track URL to database ID mapping
        self.current_response_url = None  # Track current response being processed
        self.request_start_times = {}  # Track request start times for response_time calculation

    @classmethod
    def from_crawler(cls, crawler):
        """Create pipeline instance from crawler"""
        pipeline = super().from_crawler(crawler)
        # Connect to both signals to track request timing
        crawler.signals.connect(pipeline.request_scheduled, signal=signals.request_scheduled)
        crawler.signals.connect(pipeline.response_received, signal=signals.response_received)
        return pipeline

    def _get_parent_request_info(self, request, spider):
        """Extract parent request information if available"""
        parent_id = None
        parent_url = None

        # Get job_id for the current spider
        job_id = self.settings.get_identifier_value(spider)

        try:
            # Method 1: Use current response URL as parent (most reliable)
            if self.current_response_url and self.current_response_url != request.url:
                parent_url = self.current_response_url
                if parent_url in self.url_to_id_map:
                    parent_id = self.url_to_id_map[parent_url]

            # Method 2: Check request meta for referer
            if not parent_id and hasattr(request, 'meta') and request.meta:
                if 'referer' in request.meta:
                    parent_url = request.meta['referer']

                    # Look up in our URL mapping first (faster)
                    if parent_url in self.url_to_id_map:
                        parent_id = self.url_to_id_map[parent_url]
                    else:
                        # Look up in database
                        try:
                            sql = f"SELECT id FROM {self.settings.db_requests_table} WHERE url = %s AND job_id = %s ORDER BY created_at DESC LIMIT 1"
                            result = self.db.execute(sql, (parent_url, job_id))
                            if result:
                                parent_id = result[0]
                                # Cache the result
                                self.url_to_id_map[parent_url] = parent_id
                        except Exception as e:
                            logger.warning(f"Could not look up parent ID by referer URL: {e}")

        except Exception as e:
            logger.warning(f"Could not extract parent request info: {e}")

        return parent_id, parent_url

    def log_request(self, request, spider, response=None):
        """Log request to database with complete information"""
        job_id = self.settings.get_identifier_value(spider)

        fingerprint = get_request_fingerprint(request)
        parent_id, parent_url = self._get_parent_request_info(request, spider)
        created_at = get_current_datetime(self.settings)

        # Get status code and response time if response is available
        status_code = response.status if response else None
        response_time = None

        if response:
            fingerprint = get_request_fingerprint(request)
            request_start_time = self.request_start_times.get(fingerprint)
            if request_start_time:
                current_time = created_at.timestamp()
                response_time = current_time - request_start_time
                # Clean up the start time to free memory
                self.request_start_times.pop(fingerprint, None)

        sql = f"""
            INSERT INTO {self.settings.db_requests_table}
            (job_id, url, method, fingerprint, parent_id, parent_url, status_code, response_time, created_at)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING id
        """
        try:
            result = self.db.execute(sql, (
                job_id,
                request.url,
                request.method,
                fingerprint,
                parent_id,
                parent_url,
                status_code,
                response_time,
                created_at
            ))

            # Get the inserted record ID and store it for future parent lookups
            if result:
                record_id = result[0]
                self.request_id_map[fingerprint] = record_id
                self.url_to_id_map[request.url] = record_id  # Store URL to ID mapping

            self.db.commit()

        except Exception as e:
            logger.error(f"Failed to log request: {e}")
            self.db.rollback()

    def request_scheduled(self, request, spider):
        """Called when a request is scheduled - track start time"""
        fingerprint = get_request_fingerprint(request)
        current_time = get_current_datetime(self.settings).timestamp()
        self.request_start_times[fingerprint] = current_time

    def response_received(self, response, request, spider):
        """Called when a response is received - log request with complete info"""

        self.current_response_url = response.url

        # Log the request with complete response information
        self.log_request(request, spider, response)

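Because each stored request carries the id of its parent, a crawl can be read back as a tree. A sketch using a recursive CTE against the default `job_requests` table (connection string and job id are placeholders, not part of the package):

```python
import psycopg2

conn = psycopg2.connect("postgresql://user:password@localhost:5432/database")

REQUEST_TREE_SQL = """
WITH RECURSIVE tree AS (
    SELECT id, url, parent_id, 0 AS depth
    FROM job_requests
    WHERE job_id = %s AND parent_id IS NULL
    UNION ALL
    SELECT r.id, r.url, r.parent_id, t.depth + 1
    FROM job_requests r
    JOIN tree t ON r.parent_id = t.id
)
SELECT depth, url FROM tree ORDER BY depth, url;
"""

with conn.cursor() as cur:
    cur.execute(REQUEST_TREE_SQL, ("my_spider",))
    for depth, url in cur.fetchall():
        print("  " * depth + url)   # indent children under their parent

conn.close()
```
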
scrapy_item_ingest/utils/fingerprint.py
@@ -0,0 +1,21 @@
"""
Request fingerprint utilities for generating unique request identifiers.
"""
import logging

from scrapy.utils.request import fingerprint

logger = logging.getLogger(__name__)


def get_request_fingerprint(request):
    """Generate a fingerprint for the request"""

    fp = fingerprint(request)

    if isinstance(fp, bytes):
        fp = fp.hex()

    fp = fp.replace("\\x", "")

    return fp

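For reference, Scrapy's `fingerprint()` returns raw bytes, which is why the helper hex-encodes it before the value is written to the `VARCHAR(64)` fingerprint column. A quick check outside a crawl (a minimal sketch; the URL is just an example):

```python
from scrapy.http import Request

from scrapy_item_ingest.utils.fingerprint import get_request_fingerprint

req = Request("https://example.com/page?id=1")
fp = get_request_fingerprint(req)

print(fp)        # hex string; identical requests produce identical fingerprints
print(len(fp))   # fits comfortably in the VARCHAR(64) fingerprint column
```
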
scrapy_item_ingest/utils/time.py
@@ -0,0 +1,19 @@
from datetime import datetime

import pytz


def get_current_datetime(settings):
    """
    Returns the current datetime localized to the timezone defined by settings.get_tz().
    Raises a TypeError if settings is None or invalid.
    """
    if settings is None:
        raise TypeError("settings must not be None")
    tzname = settings.get_tz()
    try:
        tz = pytz.timezone(tzname)
    except Exception as e:
        raise ValueError(f"invalid timezone '{tzname}'") from None

    return tz.localize(datetime.now())

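The helper only needs an object exposing `get_tz()`, so it is easy to exercise outside a crawl. A minimal sketch with a stub settings object (UTC chosen here purely for the example):

```python
from scrapy_item_ingest.utils.time import get_current_datetime

class StubSettings:
    """Minimal stand-in for config.settings.Settings in this example."""
    def get_tz(self):
        return "UTC"

now = get_current_datetime(StubSettings())
print(now)          # timezone-aware datetime, e.g. 2025-01-01 12:00:00+00:00
print(now.tzinfo)   # UTC

# An unknown zone name raises ValueError, and passing None raises TypeError:
# get_current_datetime(None)  -> TypeError
```
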
scrapy_item_ingest-0.2.4.dist-info/METADATA
@@ -0,0 +1,131 @@
Metadata-Version: 2.4
Name: scrapy_item_ingest
Version: 0.2.4
Summary: Scrapy extension for database ingestion with job/spider tracking
Home-page: https://github.com/fawadss1/scrapy_item_ingest
Author: Fawad Ali
Author-email: fawadstar6@gmail.com
Project-URL: Documentation, https://scrapy-item-ingest.readthedocs.io/
Project-URL: Source, https://github.com/fawadss1/scrapy_item_ingest
Project-URL: Tracker, https://github.com/fawadss1/scrapy_item_ingest/issues
Keywords: scrapy,database,postgresql,web-scraping,data-pipeline
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Framework :: Scrapy
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Database
Requires-Python: >=3.7
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: scrapy>=2.13.3
Requires-Dist: psycopg2-binary>=2.9.10
Requires-Dist: itemadapter>=0.11.0
Requires-Dist: SQLAlchemy>=2.0.41
Requires-Dist: pytz>=2025.2
Provides-Extra: docs
Requires-Dist: sphinx>=5.0.0; extra == "docs"
Requires-Dist: sphinx_rtd_theme>=1.2.0; extra == "docs"
Requires-Dist: myst-parser>=0.18.0; extra == "docs"
Requires-Dist: sphinx-autodoc-typehints>=1.19.0; extra == "docs"
Requires-Dist: sphinx-copybutton>=0.5.0; extra == "docs"
Provides-Extra: dev
Requires-Dist: pytest>=7.0.0; extra == "dev"
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
Requires-Dist: black>=22.0.0; extra == "dev"
Requires-Dist: flake8>=5.0.0; extra == "dev"
Requires-Dist: mypy>=0.991; extra == "dev"
Requires-Dist: pre-commit>=2.20.0; extra == "dev"
Provides-Extra: test
Requires-Dist: pytest>=7.0.0; extra == "test"
Requires-Dist: pytest-cov>=4.0.0; extra == "test"
Requires-Dist: pytest-mock>=3.8.0; extra == "test"
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: keywords
Dynamic: license-file
Dynamic: project-url
Dynamic: provides-extra
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

# Scrapy Item Ingest

A tiny, straightforward addon for Scrapy that saves your items, requests, and logs to PostgreSQL. No boilerplate, no ceremony.

## Install

```bash
pip install scrapy-item-ingest
```

## Minimal setup (settings.py)

```python
ITEM_PIPELINES = {
    'scrapy_item_ingest.DbInsertPipeline': 300,
}

EXTENSIONS = {
    'scrapy_item_ingest.LoggingExtension': 500,
}

# Pick ONE of the two database config styles:
DB_URL = "postgresql://user:password@localhost:5432/database"
# Or use discrete fields (avoids URL encoding):
# DB_HOST = "localhost"
# DB_PORT = 5432
# DB_USER = "user"
# DB_PASSWORD = "password"
# DB_NAME = "database"

# Optional
CREATE_TABLES = True  # auto-create tables on first run (default True)
JOB_ID = 1            # or omit; spider name will be used
```

Run your spider:

```bash
scrapy crawl your_spider
```

## Troubleshooting

- Password has special characters like `@` or `$`?
  - In a URL, encode them: `@` -> `%40`, `$` -> `%24`.
  - Example: `postgresql://user:PAK%40swat1%24@localhost:5432/db`
  - Or use the discrete fields (no encoding needed).

## Useful settings (optional)

- `LOG_DB_LEVEL` (default: `DEBUG`) — minimum level stored in DB
- `LOG_DB_CAPTURE_LEVEL` — capture level for Scrapy loggers routed to DB (does not affect console)
- `LOG_DB_LOGGERS` — allowed logger prefixes (defaults always include `[spider.name, 'scrapy']`)
- `LOG_DB_EXCLUDE_LOGGERS` (default: `['scrapy.core.scraper']`)
- `LOG_DB_EXCLUDE_PATTERNS` (default: `['Scraped from <']`)
- `CREATE_TABLES` (default: `True`) — create `job_items`, `job_requests`, `job_logs` on startup
- `ITEMS_TABLE`, `REQUESTS_TABLE`, `LOGS_TABLE` — override table names

## Links

- Docs: https://scrapy-item-ingest.readthedocs.io/
- Changelog: docs/development/changelog.rst
- Issues: https://github.com/fawadss1/scrapy_item_ingest/issues

## License

MIT License. See [LICENSE](LICENSE).

scrapy_item_ingest-0.2.4.dist-info/RECORD
@@ -0,0 +1,24 @@
scrapy_item_ingest/__init__.py,sha256=FofylFBUxWl6Xt5n14icxmbxteXOGpUc3PC1cirsnrU,1507
scrapy_item_ingest/config/__init__.py,sha256=Foyt52_KDRIoDZtSH5ttcWxQXCOUgzebo4IGCPQwriY,55
scrapy_item_ingest/config/settings.py,sha256=JrbRkF1_ZhrzrhQ0kj_wzoT7ksls5FtpbnaBh9CAKv4,5060
scrapy_item_ingest/database/__init__.py,sha256=-D9cfI8Hrap74UkIUmcOZ-ikAZ8HKSswZAZMBtjq69A,50
scrapy_item_ingest/database/connection.py,sha256=B4SGBz3zfh-GzpU-k-EiQeY1x1Rw9inkSDVNXYt6T88,5948
scrapy_item_ingest/database/schema.py,sha256=2HcBbW3VIWva59YCxyAinwZQDidFuyU5zuOCdCwBZUI,2866
scrapy_item_ingest/extensions/__init__.py,sha256=G8xe0Bssf4jFvi3D1gNyOpylaDHlo-RKHEX9_tIB2f8,51
scrapy_item_ingest/extensions/base.py,sha256=OWo44rGYOq3L_c-aZL48We_HJDHMjykQSMyaTFPQXgg,3001
scrapy_item_ingest/extensions/logging.py,sha256=F3tDA-PAx2E9oXcj1VFNvuC_xoavaPq2T85Rlw4B-vc,4850
scrapy_item_ingest/pipelines/__init__.py,sha256=NvbUeLCwjFPvVaSTzdnN6LkToJ1ISAM91EmVero9FXo,50
scrapy_item_ingest/pipelines/base.py,sha256=wTB-VTVOA35TPkPInPeLNMfy-2f7Ab3XM-VCOC964tQ,1521
scrapy_item_ingest/pipelines/items.py,sha256=-RVZ6PE0Zq5jplImvRsZUyu52ktNIZrqksreLCIxjs0,1180
scrapy_item_ingest/pipelines/main.py,sha256=tc1y4R8Roc9c3LU49Gfw9LrJ5SaZ2dPBSsMDb0ZlgWQ,1385
scrapy_item_ingest/pipelines/requests.py,sha256=Sw3b4ZP2DbRxz3l-Cek2chR37EhwC4b02dSZ47UpfjM,5823
scrapy_item_ingest/utils/__init__.py,sha256=xuzfL-u3NkFElIrBygQISYv0CKMdSVvreuL16JXZMRM,49
scrapy_item_ingest/utils/fingerprint.py,sha256=jOu2XAxG2WABrk9S6itrTvNqvQwxcjYT1omsVaE0Eyo,421
scrapy_item_ingest/utils/serialization.py,sha256=GjKEvAQV4oQUXP2hudreCIIBpH6GniQ3MvfFpODxHfk,251
scrapy_item_ingest/utils/time.py,sha256=YPtfwct4cFxhnhb-o1d9ZB_GI8DimPYsxTVQItdZ_Ao,547
scrapy_item_ingest-0.2.4.dist-info/licenses/LICENSE,sha256=DhJQ4_j45c_DWghISLKmJshcLvX_Pr7QXaahe2iRMNo,1087
scrapy_item_ingest-0.2.4.dist-info/METADATA,sha256=orMA19rYt3RtmPcDFntx6TsShzD1uYSx-CO7UPSNgn8,4534
scrapy_item_ingest-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
scrapy_item_ingest-0.2.4.dist-info/entry_points.txt,sha256=WKFpo9Dy0qX1S1PT8NvIHqZmSxBCgyAM480LnLR8S1E,172
scrapy_item_ingest-0.2.4.dist-info/top_level.txt,sha256=bu2ekFWcSH0ANdc8oGDdmZXaSC6kNuhtC-AggLsUQCU,19
scrapy_item_ingest-0.2.4.dist-info/RECORD,,

scrapy_item_ingest-0.2.4.dist-info/licenses/LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Fawad Ali

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

scrapy_item_ingest-0.2.4.dist-info/top_level.txt
@@ -0,0 +1 @@
scrapy_item_ingest