scrapy-item-ingest 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,48 @@
1
+ """
2
+ scrapy_item_ingest - A Scrapy extension for ingesting items and requests into databases.
3
+
4
+ This package provides pipelines and extensions for storing scraped data, tracking requests,
5
+ and logging spider events to PostgreSQL databases with support for both spider-based and
6
+ job-based identification.
7
+
8
+ Main Components:
9
+ - DbInsertPipeline: Combined pipeline for items and requests
10
+ - LoggingExtension: Extension for logging spider events
11
+ - ItemsPipeline: Standalone items processing pipeline
12
+ - RequestsPipeline: Standalone requests tracking pipeline
13
+ """
14
+
15
+ __version__ = "0.2.4"
16
+ __author__ = "Fawad Ali"
17
+ __description__ = "Scrapy extension for database ingestion with job/spider tracking"
18
+
19
+ # Import main classes directly from organized modules
20
+ from .pipelines.main import DbInsertPipeline
21
+ from .extensions.logging import LoggingExtension
22
+
23
+ # Import individual components for advanced users
24
+ from .pipelines.items import ItemsPipeline
25
+ from .pipelines.requests import RequestsPipeline
26
+
27
+ # Import configuration utilities
28
+ from .config.settings import Settings, validate_settings
29
+
30
+ # Define what gets imported with "from scrapy_item_ingest import *"
31
+ __all__ = [
32
+ # Main classes (most commonly used)
33
+ 'DbInsertPipeline',
34
+ 'LoggingExtension',
35
+
36
+ # Individual components
37
+ 'ItemsPipeline',
38
+ 'RequestsPipeline',
39
+
40
+ # Configuration
41
+ 'Settings',
42
+ 'validate_settings',
43
+
44
+ # Package metadata
45
+ '__version__',
46
+ '__author__',
47
+ '__description__',
48
+ ]
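As a quick orientation, here is a small sketch of what these exports look like at import time; the printed values come from this release, and the two names most projects reference from `settings.py` are imported last:

```python
import scrapy_item_ingest as sii

print(sii.__version__)    # "0.2.4"
print(sii.__author__)     # "Fawad Ali"
print(sii.__all__[:2])    # ['DbInsertPipeline', 'LoggingExtension']

# The two names most projects wire into Scrapy settings:
from scrapy_item_ingest import DbInsertPipeline, LoggingExtension
```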
@@ -0,0 +1,2 @@
1
+ """Configuration modules for scrapy_item_ingest."""
2
+
@@ -0,0 +1,134 @@
1
+ """
2
+ Module for managing and validating crawler settings.
3
+
4
+ This module provides utility classes and functions for handling the settings of
5
+ a crawler, including the database configuration, operational parameters, and
6
+ customizable options. The primary class `Settings` provides an interface for
7
+ accessing settings dynamically and offers default fallbacks where values are
8
+ not explicitly defined. The `validate_settings` utility function ensures that critical
9
+ configuration is present.
10
+ """
11
+
12
+
13
+ class Settings:
14
+ """
15
+ Handles settings configuration for crawlers, providing access to default values,
16
+ database table names, and other operational parameters defined in crawler settings.
17
+
18
+ This class facilitates the standardized management and retrieval of settings that
19
+ are essential for database operations and crawler configurations. Its purpose
20
+ is to provide default fallbacks and dynamically adapt to user-specified settings.
21
+ """
22
+
23
+ DEFAULT_ITEMS_TABLE = 'job_items'
24
+ DEFAULT_REQUESTS_TABLE = 'job_requests'
25
+ DEFAULT_LOGS_TABLE = 'job_logs'
26
+ DEFAULT_TIMEZONE = "Asia/Karachi"
27
+
28
+ def __init__(self, crawler_settings):
29
+ self.crawler_settings = crawler_settings
30
+
31
+ @property
32
+ def db_url(self):
33
+ """
34
+ Provides access to the database URL from the crawler settings.
35
+
36
+ This property is used to retrieve the database URL defined in the
37
+ crawler's settings. It is helpful when a database configuration
38
+ needs to be accessed dynamically.
39
+
40
+ :return: The database URL as defined in the crawler's configuration
41
+ :rtype: str or None
42
+ """
43
+ return self.crawler_settings.get('DB_URL')
44
+
45
+ @property
46
+ def db_type(self):
47
+ """
48
+ Retrieves the database type from the crawler settings.
49
+
50
+ This property fetches the value assigned to the key `DB_TYPE` within
51
+ the `crawler_settings`. Defaults to 'postgres' if the key is not set.
52
+
53
+ :return: The database type as a string.
54
+ :rtype: str
55
+ """
56
+ return self.crawler_settings.get('DB_TYPE', 'postgres')
57
+
58
+ @property
59
+ def db_items_table(self):
60
+ """Return static table name for items"""
61
+ return self.crawler_settings.get('ITEMS_TABLE', self.DEFAULT_ITEMS_TABLE)
62
+
63
+ @property
64
+ def db_requests_table(self):
65
+ """
66
+ This property fetches the name of the database table used to store request
67
+ information. It retrieves the value from crawler settings if defined;
68
+ otherwise, it defaults to the value of `DEFAULT_REQUESTS_TABLE`.
69
+
70
+ :return: Name of the database table for storing requests.
71
+ :rtype: str
72
+ """
73
+ return self.crawler_settings.get('REQUESTS_TABLE', self.DEFAULT_REQUESTS_TABLE)
74
+
75
+ @property
76
+ def db_logs_table(self):
77
+ """
78
+ Retrieve the name of the database logs table.
79
+
80
+ This property fetches the value of the database logs table name
81
+ provided in the crawler settings. If the value is not explicitly
82
+ defined in the settings, it falls back to the default logs table.
83
+
84
+ :return: The name of the database logs table.
85
+ :rtype: str
86
+ """
87
+ return self.crawler_settings.get('LOGS_TABLE', self.DEFAULT_LOGS_TABLE)
88
+
89
+ @property
90
+ def create_tables(self):
91
+ """
92
+ Retrieve the setting for creating database tables from crawler settings.
93
+
94
+ This property fetches the value of the 'CREATE_TABLES' option from the crawler
95
+ settings. If the option is not specified in the settings, it defaults to True.
96
+
97
+ :return: Boolean value indicating whether to create tables.
98
+ :rtype: bool
99
+ """
100
+ return self.crawler_settings.getbool('CREATE_TABLES', True)
101
+
102
+ def get_tz(self):
103
+ """
104
+ Return the timezone string for the project.
105
+ This checks for a 'TIMEZONE' setting in the crawler settings and falls back to the default ('Asia/Karachi').
106
+ Returns:
107
+ str: The timezone string (e.g., 'Asia/Karachi').
108
+ """
109
+ return self.crawler_settings.get('TIMEZONE', self.DEFAULT_TIMEZONE)
110
+
111
+ @staticmethod
112
+ def get_identifier_column():
113
+ """Get the identifier column name"""
114
+ return "job_id"
115
+
116
+ def get_identifier_value(self, spider):
117
+ """Get the identifier value with smart fallback"""
118
+ job_id = self.crawler_settings.get('JOB_ID', None)
119
+
120
+ if self.create_tables:
121
+ # When creating tables, use JOB_ID if provided, else spider name
122
+ return job_id if job_id else spider.name
123
+ else:
124
+ # When using existing tables, use JOB_ID if provided, else spider name
125
+ return job_id if job_id else spider.name
126
+
127
+
128
+ def validate_settings(settings):
129
+ """Validate configuration settings"""
130
+ if not settings.db_url:
131
+ raise ValueError("DB_URL must be set in settings")
132
+
133
+ # Job ID is now optional - will use spider name as fallback
134
+ return True
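A minimal sketch of how `Settings` and `validate_settings` behave, using Scrapy's own settings container as the underlying store; the DSN and the overridden table name are placeholders:

```python
from scrapy.settings import Settings as ScrapySettings
from scrapy_item_ingest.config.settings import Settings, validate_settings

crawler_settings = ScrapySettings({
    "DB_URL": "postgresql://user:pass@localhost:5432/scraping",  # placeholder DSN
    "ITEMS_TABLE": "my_items",                                   # overrides DEFAULT_ITEMS_TABLE
})

cfg = Settings(crawler_settings)
validate_settings(cfg)              # raises ValueError if DB_URL is missing
print(cfg.db_items_table)           # "my_items"
print(cfg.db_requests_table)        # "job_requests" (default fallback)
print(cfg.create_tables)            # True (default)
print(cfg.get_identifier_column())  # "job_id"
```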
@@ -0,0 +1,2 @@
1
+ """Database modules for scrapy_item_ingest."""
2
+
@@ -0,0 +1,150 @@
1
+ # scrapy_item_ingest/database/connection.py
2
+
3
+ import logging
4
+ from typing import Optional, Any, Sequence
5
+ from urllib.parse import quote, unquote
6
+
7
+ import psycopg2
8
+ from psycopg2 import OperationalError
9
+
10
+
11
+ class DBConnection:
12
+ """
13
+ PostgreSQL connection manager (singleton) with a small convenience API used by
14
+ pipelines and schema utilities. Supports either a DSN/URL or settings-based
15
+ configuration and exposes `connect/execute/commit/rollback/close` methods.
16
+ """
17
+
18
+ _instance = None # Singleton instance
19
+ _connection = None
20
+ _db_url: Optional[str] = None
21
+ _logger = logging.getLogger(__name__)
22
+
23
+ def __new__(cls, db_url: Optional[str] = None):
24
+ # Ensure only one instance exists (singleton) and accept optional db_url
25
+ if cls._instance is None:
26
+ cls._instance = super(DBConnection, cls).__new__(cls)
27
+ if db_url:
28
+ cls._instance._db_url = db_url
29
+ cls._instance._initialize_connection()
30
+ else:
31
+ # If an URL is passed later and we don't have one stored yet, keep it
32
+ if db_url and cls._instance._db_url is None:
33
+ cls._instance._db_url = db_url
34
+ # Do not auto-reconnect here; next use will reconnect if needed
35
+ return cls._instance
36
+
37
+ def _normalize_dsn(self, dsn: str) -> str:
38
+ """Normalize a PostgreSQL DSN/URL by URL-encoding credentials if needed.
39
+ Handles passwords that mistakenly include raw '@' or '$' by treating the last
40
+ '@' as the boundary between credentials and host.
41
+ """
42
+ try:
43
+ if "://" not in dsn:
44
+ return dsn
45
+ scheme, rest = dsn.split("://", 1)
46
+ # Separate netloc and remaining path/query/fragment
47
+ if "/" in rest:
48
+ netloc, tail = rest.split("/", 1)
49
+ tail = "/" + tail
50
+ else:
51
+ netloc, tail = rest, ""
52
+ if "@" in netloc:
53
+ userinfo, hostport = netloc.rsplit("@", 1)
54
+ if ":" in userinfo:
55
+ user, pwd = userinfo.split(":", 1)
56
+ # Encode only if password contains reserved characters
57
+ if any(c in pwd for c in "@:$ /\\"):
58
+ user_enc = quote(unquote(user), safe="")
59
+ pwd_enc = quote(pwd, safe="")
60
+ netloc = f"{user_enc}:{pwd_enc}@{hostport}"
61
+ return f"{scheme}://{netloc}{tail}"
62
+ except Exception:
63
+ return dsn
64
+
65
+ def _initialize_connection(self):
66
+ """Initialize the PostgreSQL connection once (or reconnect if closed)."""
67
+ if self._connection is not None and getattr(self._connection, "closed", 0) == 0:
68
+ return
69
+
70
+ source = "unknown"
71
+ try:
72
+ if self._db_url:
73
+ source = "db_url"
74
+ dsn = self._normalize_dsn(self._db_url)
75
+ self._connection = psycopg2.connect(dsn)
76
+ else:
77
+ # Lazy import to avoid module-level dependency on Scrapy
78
+ from scrapy.utils.project import get_project_settings
79
+ settings = get_project_settings()
80
+ source = "Scrapy settings"
81
+ self._connection = psycopg2.connect(
82
+ host=settings.get("DB_HOST"),
83
+ port=settings.get("DB_PORT"),
84
+ user=settings.get("DB_USER"),
85
+ password=settings.get("DB_PASSWORD"),
86
+ dbname=settings.get("DB_NAME"),
87
+ )
88
+ self._connection.autocommit = False # manual commit per item
89
+ except OperationalError as e:
90
+ # Mask password in logs by not printing full URL; provide hint
91
+ self._logger.error(
92
+ "Failed to connect to database via %s: %s. "
93
+ "Verify DB settings or DSN (host, port, user, dbname).",
94
+ source,
95
+ str(e),
96
+ )
97
+ raise
98
+
99
+ # Public API expected by pipelines/schema
100
+ def connect(self) -> bool:
101
+ try:
102
+ self._initialize_connection()
103
+ return True
104
+ except Exception:
105
+ return False
106
+
107
+ def cursor(self):
108
+ if self._connection is None or getattr(self._connection, "closed", 1):
109
+ self._initialize_connection()
110
+ return self._connection.cursor()
111
+
112
+ def execute(self, sql: str, params: Optional[Sequence[Any]] = None):
113
+ """Execute a SQL statement.
114
+ Returns the first row (tuple) if the statement produces a result set
115
+ (e.g., SELECT or INSERT ... RETURNING), otherwise returns None.
116
+ """
117
+ with self.cursor() as cur:
118
+ if params is not None:
119
+ cur.execute(sql, params)
120
+ else:
121
+ cur.execute(sql)
122
+ # If the statement returns rows, fetch one for callers expecting a value
123
+ if cur.description is not None:
124
+ row = cur.fetchone()
125
+ return row
126
+ return None
127
+
128
+ def commit(self):
129
+ if self._connection:
130
+ self._connection.commit()
131
+
132
+ def rollback(self):
133
+ if self._connection:
134
+ self._connection.rollback()
135
+
136
+ def get_connection(self):
137
+ """Return the active connection (always the same one)."""
138
+ if self._connection is None or getattr(self._connection, "closed", 1):
139
+ self._initialize_connection()
140
+ return self._connection
141
+
142
+ def close(self):
143
+ """Close connection gracefully when the spider ends."""
144
+ if self._connection and not self._connection.closed:
145
+ self._connection.close()
146
+
147
+
148
+ # Backwards compatibility: older code imports `DatabaseConnection`
149
+ # Export an alias so both names work.
150
+ DatabaseConnection = DBConnection
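A sketch of how the pipelines use this connection manager. It assumes a reachable PostgreSQL instance and uses a placeholder DSN; because the class is a singleton, the first DSN passed in wins for the life of the process:

```python
from scrapy_item_ingest.database.connection import DBConnection, DatabaseConnection

db = DBConnection("postgresql://user:pass@localhost:5432/scraping")  # placeholder DSN
assert db is DatabaseConnection()   # the alias returns the same singleton instance

if db.connect():                    # reconnects if the connection was closed
    row = db.execute("SELECT 1")    # first row is returned for SELECT / RETURNING statements
    print(row)                      # (1,)
    db.commit()
    db.close()
```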
@@ -0,0 +1,79 @@
1
+ """
2
+ Database schema management utilities for scrapy_item_ingest.
3
+ """
4
+ import logging
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
+ class SchemaManager:
10
+ """Database schema management"""
11
+
12
+ def __init__(self, db_connection, settings):
13
+ self.db = db_connection
14
+ self.settings = settings
15
+
16
+ def create_items_table(self):
17
+ """Create items table if it doesn't exist"""
18
+ items_table_sql = f"""
19
+ CREATE TABLE IF NOT EXISTS {self.settings.db_items_table} (
20
+ id SERIAL PRIMARY KEY,
21
+ job_id VARCHAR(255),
22
+ item JSONB,
23
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
24
+ )
25
+ """
26
+ self.db.execute(items_table_sql)
27
+ logger.info(f"Items table {self.settings.db_items_table} created/verified with job_id column")
28
+
29
+ def create_requests_table(self):
30
+ """Create requests table if it doesn't exist"""
31
+ requests_table_sql = f"""
32
+ CREATE TABLE IF NOT EXISTS {self.settings.db_requests_table} (
33
+ id SERIAL PRIMARY KEY,
34
+ job_id VARCHAR(255),
35
+ url TEXT,
36
+ method VARCHAR(10),
37
+ status_code INTEGER,
38
+ duration FLOAT,
39
+ response_time FLOAT,
40
+ fingerprint VARCHAR(64),
41
+ parent_id INTEGER,
42
+ parent_url TEXT,
43
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
44
+ FOREIGN KEY (parent_id) REFERENCES {self.settings.db_requests_table}(id)
45
+ )
46
+ """
47
+ self.db.execute(requests_table_sql)
48
+ logger.info(f"Requests table {self.settings.db_requests_table} created/verified with job_id column")
49
+
50
+ def create_logs_table(self):
51
+ """Create logs table if it doesn't exist"""
52
+ logs_table_sql = f"""
53
+ CREATE TABLE IF NOT EXISTS {self.settings.db_logs_table} (
54
+ id SERIAL PRIMARY KEY,
55
+ job_id VARCHAR(255),
56
+ level VARCHAR(50),
57
+ message TEXT,
58
+ timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
59
+ )
60
+ """
61
+ self.db.execute(logs_table_sql)
62
+ logger.info(f"Logs table {self.settings.db_logs_table} created/verified with job_id column")
63
+
64
+ def ensure_tables_exist(self):
65
+ """Create all tables if they don't exist (only if create_tables is True)"""
66
+ if not self.settings.create_tables:
67
+ logger.info("Table creation disabled. Skipping table creation.")
68
+ return
69
+
70
+ try:
71
+ self.create_items_table()
72
+ self.create_requests_table()
73
+ self.create_logs_table()
74
+ self.db.commit()
75
+ logger.info("All tables created/verified successfully")
76
+ except Exception as e:
77
+ logger.error(f"Failed to create tables: {e}")
78
+ self.db.rollback()
79
+ raise
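A sketch of using `SchemaManager` outside a pipeline, for example to pre-create tables from a one-off script; the DSN is a placeholder and `CREATE_TABLES` defaults to True:

```python
from scrapy.settings import Settings as ScrapySettings
from scrapy_item_ingest.config.settings import Settings
from scrapy_item_ingest.database.connection import DBConnection
from scrapy_item_ingest.database.schema import SchemaManager

cfg = Settings(ScrapySettings({"DB_URL": "postgresql://user:pass@localhost:5432/scraping"}))
db = DBConnection(cfg.db_url)
db.connect()

schema = SchemaManager(db, cfg)
schema.ensure_tables_exist()   # creates job_items, job_requests, job_logs; no-op if CREATE_TABLES is False
db.close()
```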
@@ -0,0 +1,2 @@
1
+ """Extension modules for scrapy_item_ingest."""
2
+
@@ -0,0 +1,81 @@
1
+ """
2
+ Base extension functionality for scrapy_item_ingest.
3
+ """
4
+ import logging
5
+
6
+ from scrapy_item_ingest.config.settings import Settings, validate_settings
7
+ from ..utils.time import get_current_datetime
8
+ from ..database.connection import DatabaseConnection
9
+ from ..database.schema import SchemaManager
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ class BaseExtension:
15
+ """Base extension with common functionality"""
16
+
17
+ def __init__(self, settings):
18
+ self.settings = settings
19
+ validate_settings(settings)
20
+ # Lazy-initialized shared DB connection and schema manager
21
+ self._db = None
22
+ self._schema_manager = None
23
+ # Prevent repeated error spam if DB logging fails
24
+ self._db_logging_enabled = True
25
+
26
+ @classmethod
27
+ def from_crawler(cls, crawler):
28
+ """Create extension instance from crawler"""
29
+ settings = Settings(crawler.settings)
30
+ return cls(settings)
31
+
32
+ def get_identifier_info(self, spider):
33
+ """Get identifier column and value for the spider"""
34
+ return self.settings.get_identifier_column(), self.settings.get_identifier_value(spider)
35
+
36
+ def _ensure_db_initialized(self):
37
+ """Initialize DB connection and schema manager lazily."""
38
+ if self._db is None:
39
+ self._db = DatabaseConnection(self.settings.db_url)
40
+ if not self._db.connect():
41
+ raise RuntimeError("Failed to connect to database for logging")
42
+ if self._schema_manager is None:
43
+ self._schema_manager = SchemaManager(self._db, self.settings)
44
+
45
+ def _ensure_logs_table_exists(self):
46
+ """Create logs table if it doesn't exist (only if create_tables is True)."""
47
+ if not self.settings.create_tables:
48
+ return
49
+ try:
50
+ self._schema_manager.create_logs_table()
51
+ self._db.commit()
52
+ except Exception as e:
53
+ self._db.rollback()
54
+
55
+ def _log_to_database(self, spider, log_level, message):
56
+ """Helper method to log messages to database using shared DBConnection."""
57
+ if not self._db_logging_enabled:
58
+ return
59
+ try:
60
+ self._ensure_db_initialized()
61
+ self._ensure_logs_table_exists()
62
+
63
+ identifier_column, identifier_value = self.get_identifier_info(spider)
64
+ sql = f"""
65
+ INSERT INTO {self.settings.db_logs_table}
66
+ ({identifier_column}, level, message, timestamp)
67
+ VALUES (%s, %s, %s, %s)
68
+ """
69
+ self._db.execute(
70
+ sql,
71
+ (
72
+ identifier_value,
73
+ log_level,
74
+ message,
75
+ get_current_datetime(self.settings),
76
+ ),
77
+ )
78
+ self._db.commit()
79
+ except Exception as e:
80
+ logger.warning("Disabling database logging after error: %s", e)  # avoid spamming repeated errors
81
+ self._db_logging_enabled = False
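A sketch of a custom extension built on `BaseExtension`; the class name and message are hypothetical, and `DB_URL` must be configured because `validate_settings` runs in the constructor:

```python
from scrapy import signals
from scrapy_item_ingest.extensions.base import BaseExtension


class SpiderOpenNotifier(BaseExtension):
    """Hypothetical extension that writes one log row when the spider opens."""

    @classmethod
    def from_crawler(cls, crawler):
        ext = super().from_crawler(crawler)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        return ext

    def spider_opened(self, spider):
        # Lazily connects, creates the logs table if allowed, and inserts a row
        self._log_to_database(spider, "INFO", f"{spider.name} opened")
```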
@@ -0,0 +1,129 @@
1
+ """
2
+ Logging extension for capturing spider errors and logs and saving them to the database.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import logging
7
+ import threading
8
+ from typing import List
9
+
10
+ from scrapy import signals
11
+ from scrapy.spiders import Spider
12
+ from scrapy.crawler import Crawler
13
+
14
+ from .base import BaseExtension
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class ScrapyAndRootFilter(logging.Filter):
20
+ """
21
+ A logging filter that allows records from the 'root' logger and any logger
22
+ within the 'scrapy' namespace.
23
+ """
24
+ def filter(self, record: logging.LogRecord) -> bool:
25
+ # Allow logs from the spider itself (which might not be in 'scrapy' namespace)
26
+ if hasattr(record, 'spider_name') and record.name == getattr(record, 'spider_name', None):
27
+ return True
28
+ return record.name == 'root' or record.name.startswith('scrapy')
29
+
30
+
31
+ class DatabaseLogHandler(logging.Handler):
32
+ """
33
+ Custom logging handler to save log records to the database in real-time.
34
+ """
35
+ _local = threading.local()
36
+
37
+ def __init__(self, extension: 'LoggingExtension', spider: Spider):
38
+ super().__init__()
39
+ self.extension = extension
40
+ self.spider = spider
41
+
42
+ def emit(self, record: logging.LogRecord):
43
+ if getattr(self._local, 'in_emit', False):
44
+ return # Prevent recursion
45
+
46
+ # Avoid capturing logs generated by this extension's own exceptions
47
+ if 'extensions/logging.py' in record.pathname.replace('\\', '/'):
48
+ return
49
+
50
+ self._local.in_emit = True
51
+ try:
52
+ # Add spider name to record for the filter
53
+ record.spider_name = self.spider.name
54
+ msg = self.format(record)
55
+ level = record.levelname
56
+ # Log directly to the database in real-time
57
+ self.extension._log_to_database(self.spider, level, msg)
58
+ except Exception:
59
+ # Use logger directly to avoid recursion if formatting fails
60
+ logger.exception("Error in DatabaseLogHandler.emit")
61
+ finally:
62
+ self._local.in_emit = False
63
+
64
+
65
+ class LoggingExtension(BaseExtension):
66
+ """
67
+ Extension for logging spider events to the database.
68
+ """
69
+
70
+ def __init__(self, settings):
71
+ super().__init__(settings)
72
+ crawler_settings = self.settings.crawler_settings
73
+ self.log_level = crawler_settings.get('LOG_LEVEL', 'INFO').upper()
74
+ self.log_format = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
75
+ self.log_dateformat = '%Y-%m-%d %H:%M:%S'
76
+
77
+ self._db_log_handler: DatabaseLogHandler | None = None
78
+ self._root_logger_ref: logging.Logger | None = None
79
+
80
+ @classmethod
81
+ def from_crawler(cls, crawler: Crawler):
82
+ """Create an extension instance from crawler."""
83
+ ext = super().from_crawler(crawler)
84
+ crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
85
+ crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
86
+ crawler.signals.connect(ext.engine_stopped, signal=signals.engine_stopped)
87
+ return ext
88
+
89
+ def spider_opened(self, spider: Spider):
90
+ """Called when a spider is opened."""
91
+ handler = DatabaseLogHandler(self, spider)
92
+ level = getattr(logging, self.log_level, logging.INFO)
93
+ handler.setLevel(level)
94
+ formatter = logging.Formatter(fmt=self.log_format, datefmt=self.log_dateformat)
95
+ handler.setFormatter(formatter)
96
+
97
+ handler.addFilter(ScrapyAndRootFilter())
98
+
99
+ self._db_log_handler = handler
100
+
101
+ root_logger = logging.getLogger()
102
+
103
+ if not any(isinstance(h, DatabaseLogHandler) for h in root_logger.handlers):
104
+ root_logger.addHandler(handler)
105
+ self._root_logger_ref = root_logger
106
+
107
+ identifier_column, identifier_value = self.get_identifier_info(spider)
108
+ message = f"{identifier_column.title()} {identifier_value} started"
109
+ spider.logger.info(message)
110
+
111
+ def spider_closed(self, spider: Spider, reason: str):
112
+ """Called when a spider is closed."""
113
+ identifier_column, identifier_value = self.get_identifier_info(spider)
114
+ message = f"{identifier_column.title()} {identifier_value} closed with reason: {reason}"
115
+ spider.logger.info(message)
116
+ self._cleanup()
117
+
118
+ def engine_stopped(self):
119
+ """Called when the Scrapy engine stops."""
120
+ self._cleanup()
121
+
122
+ def _cleanup(self):
123
+ """Removes the log handler."""
124
+ if self._db_log_handler and self._root_logger_ref:
125
+ self._root_logger_ref.removeHandler(self._db_log_handler)
126
+ self._db_log_handler.close()
127
+
128
+ self._db_log_handler = None
129
+ self._root_logger_ref = None
@@ -0,0 +1,2 @@
1
+ """Pipeline modules for scrapy_item_ingest."""
2
+
@@ -0,0 +1,48 @@
1
+ """
2
+ Base pipeline functionality for scrapy_item_ingest.
3
+ """
4
+ import logging
5
+
6
+ from ..config.settings import Settings, validate_settings
7
+ from ..database.connection import DatabaseConnection
8
+ from ..database.schema import SchemaManager
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ class BasePipeline:
14
+ """Base pipeline with common functionality"""
15
+
16
+ def __init__(self, settings):
17
+ self.settings = settings
18
+ self.db = None
19
+ self.schema_manager = None
20
+ validate_settings(settings)
21
+
22
+ @classmethod
23
+ def from_crawler(cls, crawler):
24
+ """Create pipeline instance from crawler"""
25
+ settings = Settings(crawler.settings)
26
+ return cls(settings)
27
+
28
+ def open_spider(self, spider):
29
+ """Called when spider is opened"""
30
+ # Establish database connection
31
+ self.db = DatabaseConnection(self.settings.db_url)
32
+ if not self.db.connect():
33
+ raise Exception("Failed to connect to database")
34
+
35
+ # Initialize schema manager
36
+ self.schema_manager = SchemaManager(self.db, self.settings)
37
+
38
+ # Ensure tables exist
39
+ self.schema_manager.ensure_tables_exist()
40
+
41
+ def close_spider(self, spider):
42
+ """Called when spider is closed"""
43
+ if self.db:
44
+ self.db.close()
45
+
46
+ def get_identifier_info(self, spider):
47
+ """Get identifier column and value for the spider"""
48
+ return self.settings.get_identifier_column(), self.settings.get_identifier_value(spider)
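A sketch of a custom pipeline that reuses the lifecycle provided by `BasePipeline`; the class is hypothetical and assumes `DB_URL` is configured:

```python
from scrapy_item_ingest.pipelines.base import BasePipeline


class CountingPipeline(BasePipeline):
    """Hypothetical pipeline that counts items while reusing the shared DB lifecycle."""

    def open_spider(self, spider):
        super().open_spider(spider)   # connects and ensures tables exist
        self.count = 0

    def process_item(self, item, spider):
        self.count += 1
        return item

    def close_spider(self, spider):
        spider.logger.info("processed %d items for %s", self.count, spider.name)
        super().close_spider(spider)
```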
@@ -0,0 +1,38 @@
1
+ """
2
+ Items pipeline for storing scraped items.
3
+ """
4
+ import logging
5
+
6
+ from itemadapter import ItemAdapter
7
+ from scrapy.exceptions import DropItem
8
+
9
+ from .base import BasePipeline
10
+ from ..utils.serialization import serialize_item_data
11
+ from ..utils.time import get_current_datetime
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class ItemsPipeline(BasePipeline):
17
+ """Pipeline for handling scraped items"""
18
+
19
+ def process_item(self, item, spider):
20
+ """Process and store item in database"""
21
+ job_id = self.settings.get_identifier_value(spider)
22
+
23
+ adapter = ItemAdapter(item)
24
+ item_dict = adapter.asdict()
25
+ created_at = get_current_datetime(self.settings)
26
+
27
+ # Store everything as JSON in the item column
28
+ try:
29
+ sql = f"INSERT INTO {self.settings.db_items_table} (job_id, item, created_at) VALUES (%s, %s, %s)"
30
+ json_data = serialize_item_data(item_dict)
31
+
32
+ self.db.execute(sql, (job_id, json_data, created_at))
33
+ self.db.commit()
34
+ except Exception as e:
35
+ self.db.rollback()
36
+ raise DropItem(f"DB insert error: {e}")
37
+
38
+ return item
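Because `ItemAdapter` is used, plain dicts and `scrapy.Item` subclasses are handled identically, each stored as one JSONB row. A hypothetical spider illustrating both forms:

```python
import scrapy


class Product(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()


class ProductsSpider(scrapy.Spider):
    name = "products"
    start_urls = ["https://example.com/products"]  # placeholder URL

    def parse(self, response):
        # Either form ends up as a single JSONB row in the items table
        yield Product(name="Widget", price=9.99)
        yield {"name": "Gadget", "price": 19.99}
```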
@@ -0,0 +1,42 @@
1
+ """
2
+ Main pipeline that combines items and requests functionality.
3
+ """
4
+ import logging
5
+
6
+ from .items import ItemsPipeline
7
+ from .requests import RequestsPipeline
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ class DbInsertPipeline(ItemsPipeline, RequestsPipeline):
13
+ """
14
+ Main pipeline that combines item processing and request tracking.
15
+ Inherits from both ItemsPipeline and RequestsPipeline.
16
+ """
17
+
18
+ def __init__(self, settings):
19
+ # Initialize both parent classes
20
+ ItemsPipeline.__init__(self, settings)
21
+ RequestsPipeline.__init__(self, settings)
22
+
23
+ @classmethod
24
+ def from_crawler(cls, crawler):
25
+ """Create pipeline instance from crawler"""
26
+ # Use RequestsPipeline's from_crawler to get signal connections
27
+ return RequestsPipeline.from_crawler.__func__(cls, crawler)
28
+
29
+ def open_spider(self, spider):
30
+ """Called when spider is opened"""
31
+ # Use the base class implementation
32
+ super().open_spider(spider)
33
+
34
+ def close_spider(self, spider):
35
+ """Called when spider is closed"""
36
+ # Use the base class implementation
37
+ super().close_spider(spider)
38
+
39
+ def process_item(self, item, spider):
40
+ """Process and store item in database"""
41
+ # Use ItemsPipeline's process_item method
42
+ return ItemsPipeline.process_item(self, item, spider)
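For reference, a sketch of the resulting method resolution order, which explains why `from_crawler` is delegated to `RequestsPipeline` (to get the signal connections) while `process_item` is taken from `ItemsPipeline`:

```python
from scrapy_item_ingest import DbInsertPipeline

print([cls.__name__ for cls in DbInsertPipeline.__mro__])
# ['DbInsertPipeline', 'ItemsPipeline', 'RequestsPipeline', 'BasePipeline', 'object']
```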
@@ -0,0 +1,138 @@
1
+ """
2
+ Requests pipeline for tracking request information.
3
+ """
4
+ import logging
5
+
6
+ from scrapy import signals
7
+
8
+ from .base import BasePipeline
9
+ from ..utils.fingerprint import get_request_fingerprint
10
+ from ..utils.time import get_current_datetime
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class RequestsPipeline(BasePipeline):
16
+ """Pipeline for handling request tracking"""
17
+
18
+ def __init__(self, settings):
19
+ super().__init__(settings)
20
+ self.request_id_map = {} # Track fingerprint to database ID mapping
21
+ self.url_to_id_map = {} # Track URL to database ID mapping
22
+ self.current_response_url = None # Track current response being processed
23
+ self.request_start_times = {} # Track request start times for response_time calculation
24
+
25
+ @classmethod
26
+ def from_crawler(cls, crawler):
27
+ """Create pipeline instance from crawler"""
28
+ pipeline = super().from_crawler(crawler)
29
+ # Connect to both signals to track request timing
30
+ crawler.signals.connect(pipeline.request_scheduled, signal=signals.request_scheduled)
31
+ crawler.signals.connect(pipeline.response_received, signal=signals.response_received)
32
+ return pipeline
33
+
34
+ def _get_parent_request_info(self, request, spider):
35
+ """Extract parent request information if available"""
36
+ parent_id = None
37
+ parent_url = None
38
+
39
+ # Get job_id for the current spider
40
+ job_id = self.settings.get_identifier_value(spider)
41
+
42
+ try:
43
+ # Method 1: Use current response URL as parent (most reliable)
44
+ if self.current_response_url and self.current_response_url != request.url:
45
+ parent_url = self.current_response_url
46
+ if parent_url in self.url_to_id_map:
47
+ parent_id = self.url_to_id_map[parent_url]
48
+
49
+ # Method 2: Check request meta for referer
50
+ if not parent_id and hasattr(request, 'meta') and request.meta:
51
+ if 'referer' in request.meta:
52
+ parent_url = request.meta['referer']
53
+
54
+ # Look up in our URL mapping first (faster)
55
+ if parent_url in self.url_to_id_map:
56
+ parent_id = self.url_to_id_map[parent_url]
57
+ else:
58
+ # Look up in database
59
+ try:
60
+ sql = f"SELECT id FROM {self.settings.db_requests_table} WHERE url = %s AND job_id = %s ORDER BY created_at DESC LIMIT 1"
61
+ result = self.db.execute(sql, (parent_url, job_id))
62
+ if result:
63
+ parent_id = result[0]
64
+ # Cache the result
65
+ self.url_to_id_map[parent_url] = parent_id
66
+
67
+ except Exception as e:
68
+ logger.warning(f"Could not look up parent ID by referer URL: {e}")
69
+
70
+ except Exception as e:
71
+ logger.warning(f"Could not extract parent request info: {e}")
72
+
73
+ return parent_id, parent_url
74
+
75
+ def log_request(self, request, spider, response=None):
76
+ """Log request to database with complete information"""
77
+ job_id = self.settings.get_identifier_value(spider)
78
+
79
+ fingerprint = get_request_fingerprint(request)
80
+ parent_id, parent_url = self._get_parent_request_info(request, spider)
81
+ created_at = get_current_datetime(self.settings)
82
+
83
+ # Get status code and response time if response is available
84
+ status_code = response.status if response else None
85
+ response_time = None
86
+
87
+ if response:
88
+ fingerprint = get_request_fingerprint(request)
89
+ request_start_time = self.request_start_times.get(fingerprint)
90
+ if request_start_time:
91
+ current_time = created_at.timestamp()
92
+ response_time = current_time - request_start_time
93
+ # Clean up the start time to free memory
94
+ self.request_start_times.pop(fingerprint, None)
95
+
96
+ sql = f"""
97
+ INSERT INTO {self.settings.db_requests_table}
98
+ (job_id, url, method, fingerprint, parent_id, parent_url, status_code, response_time, created_at)
99
+ VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING id
100
+ """
101
+ try:
102
+ result = self.db.execute(sql, (
103
+ job_id,
104
+ request.url,
105
+ request.method,
106
+ fingerprint,
107
+ parent_id,
108
+ parent_url,
109
+ status_code,
110
+ response_time,
111
+ created_at
112
+ ))
113
+
114
+ # Get the inserted record ID and store it for future parent lookups
115
+ if result:
116
+ record_id = result[0]
117
+ self.request_id_map[fingerprint] = record_id
118
+ self.url_to_id_map[request.url] = record_id # Store URL to ID mapping
119
+
120
+ self.db.commit()
121
+
122
+ except Exception as e:
123
+ logger.error(f"Failed to log request: {e}")
124
+ self.db.rollback()
125
+
126
+ def request_scheduled(self, request, spider):
127
+ """Called when a request is scheduled - track start time"""
128
+ fingerprint = get_request_fingerprint(request)
129
+ current_time = get_current_datetime(self.settings).timestamp()
130
+ self.request_start_times[fingerprint] = current_time
131
+
132
+ def response_received(self, response, request, spider):
133
+ """Called when a response is received - log request with complete info"""
134
+
135
+ self.current_response_url = response.url
136
+
137
+ # Log the request with complete response information
138
+ self.log_request(request, spider, response)
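The `parent_id`/`parent_url` columns make it possible to reconstruct the crawl tree afterwards. A sketch of such a query with psycopg2; the DSN and job_id are placeholders and the default table name is assumed:

```python
import psycopg2

conn = psycopg2.connect("postgresql://user:pass@localhost:5432/scraping")  # placeholder DSN
with conn.cursor() as cur:
    cur.execute(
        """
        SELECT child.id, child.url, parent.url AS parent_url, child.response_time
        FROM job_requests AS child
        LEFT JOIN job_requests AS parent ON child.parent_id = parent.id
        WHERE child.job_id = %s
        ORDER BY child.id
        """,
        ("my_spider",),
    )
    for row in cur.fetchall():
        print(row)
conn.close()
```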
@@ -0,0 +1,2 @@
1
+ """Utility modules for scrapy_item_ingest."""
2
+
@@ -0,0 +1,21 @@
1
+ """
2
+ Request fingerprint utilities for generating unique request identifiers.
3
+ """
4
+ import logging
5
+
6
+ from scrapy.utils.request import fingerprint
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ def get_request_fingerprint(request):
12
+ """Generate a fingerprint for the request"""
13
+
14
+ fp = fingerprint(request)
15
+
16
+ if isinstance(fp, bytes):
17
+ fp = fp.hex()
18
+
19
+ # .hex() already yields a plain hex string, so no further cleanup is needed
20
+
21
+ return fp
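A sketch of what the helper returns; the URL is a placeholder and the digest is stable for the same URL, method, and body:

```python
from scrapy import Request
from scrapy_item_ingest.utils.fingerprint import get_request_fingerprint

req = Request("https://example.com/page?id=1")   # placeholder request
fp = get_request_fingerprint(req)
print(fp)   # hex digest; identical requests yield identical fingerprints
```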
@@ -0,0 +1,9 @@
1
+ """
2
+ Serialization utilities for converting data to JSON-serializable format.
3
+ """
4
+ import json
5
+
6
+
7
+ def serialize_item_data(item_dict):
8
+ """Serialize item data to JSON string"""
9
+ return json.dumps(item_dict, ensure_ascii=False, default=str)
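A sketch of the two `json.dumps` options in action: `ensure_ascii=False` keeps non-ASCII text readable and `default=str` covers values such as datetimes:

```python
from datetime import datetime
from scrapy_item_ingest.utils.serialization import serialize_item_data

payload = serialize_item_data({
    "title": "Café résumé",               # kept as-is thanks to ensure_ascii=False
    "scraped_at": datetime(2025, 1, 1),   # converted via default=str
    "tags": ["a", "b"],
})
print(payload)
# {"title": "Café résumé", "scraped_at": "2025-01-01 00:00:00", "tags": ["a", "b"]}
```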
@@ -0,0 +1,19 @@
1
+ from datetime import datetime
2
+
3
+ import pytz
4
+
5
+
6
+ def get_current_datetime(settings):
7
+ """
8
+ Returns the current datetime localized to the timezone defined by settings.get_tz().
9
+ Raises a TypeError if settings is None or invalid.
10
+ """
11
+ if settings is None:
12
+ raise TypeError("settings must not be None")
13
+ tzname = settings.get_tz()
14
+ try:
15
+ tz = pytz.timezone(tzname)
16
+ except Exception as exc:
17
+ raise ValueError(f"invalid timezone '{tzname}'") from exc
18
+
19
+ return tz.localize(datetime.now())
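A sketch of the helper with an explicit timezone override; 'UTC' is used here only to keep the example environment-independent:

```python
from scrapy.settings import Settings as ScrapySettings
from scrapy_item_ingest.config.settings import Settings
from scrapy_item_ingest.utils.time import get_current_datetime

cfg = Settings(ScrapySettings({"TIMEZONE": "UTC"}))
now = get_current_datetime(cfg)
print(now.tzinfo)        # UTC
print(now.isoformat())
```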
@@ -0,0 +1,131 @@
1
+ Metadata-Version: 2.4
2
+ Name: scrapy_item_ingest
3
+ Version: 0.2.4
4
+ Summary: Scrapy extension for database ingestion with job/spider tracking
5
+ Home-page: https://github.com/fawadss1/scrapy_item_ingest
6
+ Author: Fawad Ali
7
+ Author-email: fawadstar6@gmail.com
8
+ Project-URL: Documentation, https://scrapy-item-ingest.readthedocs.io/
9
+ Project-URL: Source, https://github.com/fawadss1/scrapy_item_ingest
10
+ Project-URL: Tracker, https://github.com/fawadss1/scrapy_item_ingest/issues
11
+ Keywords: scrapy,database,postgresql,web-scraping,data-pipeline
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.7
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Framework :: Scrapy
23
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
+ Classifier: Topic :: Internet :: WWW/HTTP
25
+ Classifier: Topic :: Database
26
+ Requires-Python: >=3.7
27
+ Description-Content-Type: text/markdown
28
+ License-File: LICENSE
29
+ Requires-Dist: scrapy>=2.13.3
30
+ Requires-Dist: psycopg2-binary>=2.9.10
31
+ Requires-Dist: itemadapter>=0.11.0
32
+ Requires-Dist: SQLAlchemy>=2.0.41
33
+ Requires-Dist: pytz>=2025.2
34
+ Provides-Extra: docs
35
+ Requires-Dist: sphinx>=5.0.0; extra == "docs"
36
+ Requires-Dist: sphinx_rtd_theme>=1.2.0; extra == "docs"
37
+ Requires-Dist: myst-parser>=0.18.0; extra == "docs"
38
+ Requires-Dist: sphinx-autodoc-typehints>=1.19.0; extra == "docs"
39
+ Requires-Dist: sphinx-copybutton>=0.5.0; extra == "docs"
40
+ Provides-Extra: dev
41
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
42
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
43
+ Requires-Dist: black>=22.0.0; extra == "dev"
44
+ Requires-Dist: flake8>=5.0.0; extra == "dev"
45
+ Requires-Dist: mypy>=0.991; extra == "dev"
46
+ Requires-Dist: pre-commit>=2.20.0; extra == "dev"
47
+ Provides-Extra: test
48
+ Requires-Dist: pytest>=7.0.0; extra == "test"
49
+ Requires-Dist: pytest-cov>=4.0.0; extra == "test"
50
+ Requires-Dist: pytest-mock>=3.8.0; extra == "test"
51
+ Dynamic: author
52
+ Dynamic: author-email
53
+ Dynamic: classifier
54
+ Dynamic: description
55
+ Dynamic: description-content-type
56
+ Dynamic: home-page
57
+ Dynamic: keywords
58
+ Dynamic: license-file
59
+ Dynamic: project-url
60
+ Dynamic: provides-extra
61
+ Dynamic: requires-dist
62
+ Dynamic: requires-python
63
+ Dynamic: summary
64
+
65
+ # Scrapy Item Ingest
66
+
67
+ A tiny, straightforward addon for Scrapy that saves your items, requests, and logs to PostgreSQL. No boilerplate, no ceremony.
68
+
69
+ ## Install
70
+
71
+ ```bash
72
+ pip install scrapy-item-ingest
73
+ ```
74
+
75
+ ## Minimal setup (settings.py)
76
+
77
+ ```python
78
+ ITEM_PIPELINES = {
79
+ 'scrapy_item_ingest.DbInsertPipeline': 300,
80
+ }
81
+
82
+ EXTENSIONS = {
83
+ 'scrapy_item_ingest.LoggingExtension': 500,
84
+ }
85
+
86
+ # Pick ONE of the two database config styles:
87
+ DB_URL = "postgresql://user:password@localhost:5432/database"
88
+ # Or use discrete fields (avoids URL encoding):
89
+ # DB_HOST = "localhost"
90
+ # DB_PORT = 5432
91
+ # DB_USER = "user"
92
+ # DB_PASSWORD = "password"
93
+ # DB_NAME = "database"
94
+
95
+ # Optional
96
+ CREATE_TABLES = True # auto-create tables on first run (default True)
97
+ JOB_ID = 1 # or omit; spider name will be used
98
+ ```
99
+
100
+ Run your spider:
101
+
102
+ ```bash
103
+ scrapy crawl your_spider
104
+ ```
105
+
106
+ ## Troubleshooting
107
+
108
+ - Password has special characters like `@` or `$`?
109
+ - In a URL, encode them: `@` -> `%40`, `$` -> `%24` (see the sketch after this list).
110
+ - Example: `postgresql://user:PAK%40swat1%24@localhost:5432/db`
111
+ - Or use the discrete fields (no encoding needed).
112
+
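If you prefer to build the DSN programmatically, a small sketch using only the standard library (credentials are placeholders):

```python
from urllib.parse import quote_plus

user = "user"
password = "PAK@swat1$"   # raw password containing reserved characters
DB_URL = f"postgresql://{quote_plus(user)}:{quote_plus(password)}@localhost:5432/db"
print(DB_URL)             # postgresql://user:PAK%40swat1%24@localhost:5432/db
```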
113
+ ## Useful settings (optional)
114
+
115
+ - `LOG_DB_LEVEL` (default: `DEBUG`) — minimum level stored in DB
116
+ - `LOG_DB_CAPTURE_LEVEL` — capture level for Scrapy loggers routed to DB (does not affect console)
117
+ - `LOG_DB_LOGGERS` — allowed logger prefixes (defaults always include `[spider.name, 'scrapy']`)
118
+ - `LOG_DB_EXCLUDE_LOGGERS` (default: `['scrapy.core.scraper']`)
119
+ - `LOG_DB_EXCLUDE_PATTERNS` (default: `['Scraped from <']`)
120
+ - `CREATE_TABLES` (default: `True`) — create `job_items`, `job_requests`, `job_logs` on startup
121
+ - `ITEMS_TABLE`, `REQUESTS_TABLE`, `LOGS_TABLE` — override table names
122
+
123
+ ## Links
124
+
125
+ - Docs: https://scrapy-item-ingest.readthedocs.io/
126
+ - Changelog: docs/development/changelog.rst
127
+ - Issues: https://github.com/fawadss1/scrapy_item_ingest/issues
128
+
129
+ ## License
130
+
131
+ MIT License. See [LICENSE](LICENSE).
@@ -0,0 +1,24 @@
1
+ scrapy_item_ingest/__init__.py,sha256=FofylFBUxWl6Xt5n14icxmbxteXOGpUc3PC1cirsnrU,1507
2
+ scrapy_item_ingest/config/__init__.py,sha256=Foyt52_KDRIoDZtSH5ttcWxQXCOUgzebo4IGCPQwriY,55
3
+ scrapy_item_ingest/config/settings.py,sha256=JrbRkF1_ZhrzrhQ0kj_wzoT7ksls5FtpbnaBh9CAKv4,5060
4
+ scrapy_item_ingest/database/__init__.py,sha256=-D9cfI8Hrap74UkIUmcOZ-ikAZ8HKSswZAZMBtjq69A,50
5
+ scrapy_item_ingest/database/connection.py,sha256=B4SGBz3zfh-GzpU-k-EiQeY1x1Rw9inkSDVNXYt6T88,5948
6
+ scrapy_item_ingest/database/schema.py,sha256=2HcBbW3VIWva59YCxyAinwZQDidFuyU5zuOCdCwBZUI,2866
7
+ scrapy_item_ingest/extensions/__init__.py,sha256=G8xe0Bssf4jFvi3D1gNyOpylaDHlo-RKHEX9_tIB2f8,51
8
+ scrapy_item_ingest/extensions/base.py,sha256=OWo44rGYOq3L_c-aZL48We_HJDHMjykQSMyaTFPQXgg,3001
9
+ scrapy_item_ingest/extensions/logging.py,sha256=F3tDA-PAx2E9oXcj1VFNvuC_xoavaPq2T85Rlw4B-vc,4850
10
+ scrapy_item_ingest/pipelines/__init__.py,sha256=NvbUeLCwjFPvVaSTzdnN6LkToJ1ISAM91EmVero9FXo,50
11
+ scrapy_item_ingest/pipelines/base.py,sha256=wTB-VTVOA35TPkPInPeLNMfy-2f7Ab3XM-VCOC964tQ,1521
12
+ scrapy_item_ingest/pipelines/items.py,sha256=-RVZ6PE0Zq5jplImvRsZUyu52ktNIZrqksreLCIxjs0,1180
13
+ scrapy_item_ingest/pipelines/main.py,sha256=tc1y4R8Roc9c3LU49Gfw9LrJ5SaZ2dPBSsMDb0ZlgWQ,1385
14
+ scrapy_item_ingest/pipelines/requests.py,sha256=Sw3b4ZP2DbRxz3l-Cek2chR37EhwC4b02dSZ47UpfjM,5823
15
+ scrapy_item_ingest/utils/__init__.py,sha256=xuzfL-u3NkFElIrBygQISYv0CKMdSVvreuL16JXZMRM,49
16
+ scrapy_item_ingest/utils/fingerprint.py,sha256=jOu2XAxG2WABrk9S6itrTvNqvQwxcjYT1omsVaE0Eyo,421
17
+ scrapy_item_ingest/utils/serialization.py,sha256=GjKEvAQV4oQUXP2hudreCIIBpH6GniQ3MvfFpODxHfk,251
18
+ scrapy_item_ingest/utils/time.py,sha256=YPtfwct4cFxhnhb-o1d9ZB_GI8DimPYsxTVQItdZ_Ao,547
19
+ scrapy_item_ingest-0.2.4.dist-info/licenses/LICENSE,sha256=DhJQ4_j45c_DWghISLKmJshcLvX_Pr7QXaahe2iRMNo,1087
20
+ scrapy_item_ingest-0.2.4.dist-info/METADATA,sha256=orMA19rYt3RtmPcDFntx6TsShzD1uYSx-CO7UPSNgn8,4534
21
+ scrapy_item_ingest-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
22
+ scrapy_item_ingest-0.2.4.dist-info/entry_points.txt,sha256=WKFpo9Dy0qX1S1PT8NvIHqZmSxBCgyAM480LnLR8S1E,172
23
+ scrapy_item_ingest-0.2.4.dist-info/top_level.txt,sha256=bu2ekFWcSH0ANdc8oGDdmZXaSC6kNuhtC-AggLsUQCU,19
24
+ scrapy_item_ingest-0.2.4.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,5 @@
1
+ [scrapy.extensions]
2
+ logging_ext = scrapy_item_ingest.extensions.logging:LoggingExtension
3
+
4
+ [scrapy.pipelines]
5
+ db_ingest = scrapy_item_ingest.pipelines.main:DbInsertPipeline
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Fawad Ali
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ scrapy_item_ingest