scrapy-item-ingest 0.1.0 (scrapy_item_ingest-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of scrapy-item-ingest has been flagged as possibly problematic.
- scrapy_item_ingest/__init__.py +48 -0
- scrapy_item_ingest/config/__init__.py +2 -0
- scrapy_item_ingest/config/settings.py +82 -0
- scrapy_item_ingest/database/__init__.py +2 -0
- scrapy_item_ingest/database/connection.py +67 -0
- scrapy_item_ingest/database/schema.py +79 -0
- scrapy_item_ingest/extensions/__init__.py +2 -0
- scrapy_item_ingest/extensions/base.py +79 -0
- scrapy_item_ingest/extensions/logging.py +45 -0
- scrapy_item_ingest/pipelines/__init__.py +2 -0
- scrapy_item_ingest/pipelines/base.py +50 -0
- scrapy_item_ingest/pipelines/items.py +42 -0
- scrapy_item_ingest/pipelines/main.py +41 -0
- scrapy_item_ingest/pipelines/requests.py +169 -0
- scrapy_item_ingest/utils/__init__.py +2 -0
- scrapy_item_ingest/utils/fingerprint.py +25 -0
- scrapy_item_ingest/utils/serialization.py +28 -0
- scrapy_item_ingest-0.1.0.dist-info/METADATA +132 -0
- scrapy_item_ingest-0.1.0.dist-info/RECORD +23 -0
- scrapy_item_ingest-0.1.0.dist-info/WHEEL +5 -0
- scrapy_item_ingest-0.1.0.dist-info/entry_points.txt +5 -0
- scrapy_item_ingest-0.1.0.dist-info/licenses/LICENSE +21 -0
- scrapy_item_ingest-0.1.0.dist-info/top_level.txt +1 -0

scrapy_item_ingest/__init__.py
@@ -0,0 +1,48 @@
+"""
+scrapy_item_ingest - A Scrapy extension for ingesting items and requests into databases.
+
+This package provides pipelines and extensions for storing scraped data, tracking requests,
+and logging spider events to PostgreSQL databases with support for both spider-based and
+job-based identification.
+
+Main Components:
+- DbInsertPipeline: Combined pipeline for items and requests
+- LoggingExtension: Extension for logging spider events
+- ItemsPipeline: Standalone items processing pipeline
+- RequestsPipeline: Standalone requests tracking pipeline
+"""
+
+__version__ = "0.1.0"
+__author__ = "Fawad Ali"
+__description__ = "Scrapy extension for database ingestion with job/spider tracking"
+
+# Import main classes directly from organized modules
+from .pipelines.main import DbInsertPipeline
+from .extensions.logging import LoggingExtension
+
+# Import individual components for advanced users
+from .pipelines.items import ItemsPipeline
+from .pipelines.requests import RequestsPipeline
+
+# Import configuration utilities
+from .config.settings import Settings, validate_settings
+
+# Define what gets imported with "from scrapy_item_ingest import *"
+__all__ = [
+    # Main classes (most commonly used)
+    'DbInsertPipeline',
+    'LoggingExtension',
+
+    # Individual components
+    'ItemsPipeline',
+    'RequestsPipeline',
+
+    # Configuration
+    'Settings',
+    'validate_settings',
+
+    # Package metadata
+    '__version__',
+    '__author__',
+    '__description__',
+]
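
The package root re-exports the public API, so user code can import everything from `scrapy_item_ingest` directly rather than from the submodules. A minimal sketch, assuming the wheel is installed:

```python
# Sketch: the import surface exposed by scrapy_item_ingest/__init__.py.
import scrapy_item_ingest
from scrapy_item_ingest import DbInsertPipeline, LoggingExtension, Settings, validate_settings

print(scrapy_item_ingest.__version__)   # "0.1.0"
print(DbInsertPipeline.__module__)      # "scrapy_item_ingest.pipelines.main"
print(LoggingExtension.__module__)      # "scrapy_item_ingest.extensions.logging"
```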

scrapy_item_ingest/config/settings.py
@@ -0,0 +1,82 @@
+"""
+Configuration settings and utilities for scrapy_item_ingest.
+"""
+
+# Static table names - no longer configurable
+DEFAULT_ITEMS_TABLE = 'job_items'
+DEFAULT_REQUESTS_TABLE = 'job_requests'
+DEFAULT_LOGS_TABLE = 'job_logs'
+
+
+class Settings:
+    """Settings class to handle configuration options"""
+
+    def __init__(self, crawler_settings):
+        self.crawler_settings = crawler_settings
+
+    @property
+    def db_url(self):
+        return self.crawler_settings.get('DB_URL')
+
+    @property
+    def db_type(self):
+        return self.crawler_settings.get('DB_TYPE', 'postgres')
+
+    @property
+    def db_items_table(self):
+        """Return static table name for items"""
+        return DEFAULT_ITEMS_TABLE
+
+    @property
+    def db_requests_table(self):
+        """Return static table name for requests"""
+        return DEFAULT_REQUESTS_TABLE
+
+    @property
+    def db_logs_table(self):
+        """Return static table name for logs"""
+        return DEFAULT_LOGS_TABLE
+
+    @property
+    def create_tables(self):
+        return self.crawler_settings.getbool('CREATE_TABLES', True)
+
+    @property
+    def use_job_id(self):
+        # JOB_ID only works when CREATE_TABLES = False
+        if self.create_tables:
+            return False  # Don't use JOB_ID when creating tables
+        else:
+            return True  # Use JOB_ID when using existing tables
+
+    @property
+    def job_id(self):
+        # Always return JOB_ID or fallback to None (spider name will be used)
+        return self.crawler_settings.get('JOB_ID', None)
+
+    def get_identifier_column(self):
+        """Get the identifier column name based on mode"""
+        if self.create_tables:
+            return "spider"  # Use spider column when creating tables
+        else:
+            return "job_id"  # Use job_id column when using existing tables
+
+    def get_identifier_value(self, spider):
+        """Get the identifier value with smart fallback"""
+        job_id = self.crawler_settings.get('JOB_ID', None)
+
+        if self.create_tables:
+            # When creating tables, use JOB_ID if provided, else spider name
+            return job_id if job_id else spider.name
+        else:
+            # When using existing tables, use JOB_ID if provided, else spider name
+            return job_id if job_id else spider.name
+
+
+def validate_settings(settings):
+    """Validate configuration settings"""
+    if not settings.db_url:
+        raise ValueError("DB_URL must be set in settings")
+
+    # Job ID is now optional - will use spider name as fallback
+    return True
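
The `Settings` wrapper reads only `DB_URL`, `DB_TYPE`, `CREATE_TABLES`, and `JOB_ID` from the crawler settings; table names are fixed to `job_items`, `job_requests`, and `job_logs`. A small sketch of the identifier resolution, assuming Scrapy and the package are installed (the DSN is illustrative and the spider is a stand-in object, since only `.name` is read):

```python
# Sketch: how Settings picks the identifier column and value.
from types import SimpleNamespace
from scrapy.settings import Settings as ScrapySettings
from scrapy_item_ingest.config.settings import Settings

spider = SimpleNamespace(name="quotes")   # stand-in spider; only .name is used

dev = Settings(ScrapySettings({"DB_URL": "postgresql://user:pass@localhost/scrapy"}))
print(dev.create_tables)                  # True (default)
print(dev.get_identifier_column())        # "spider"
print(dev.get_identifier_value(spider))   # "quotes" (no JOB_ID set)

prod = Settings(ScrapySettings({
    "DB_URL": "postgresql://user:pass@localhost/scrapy",
    "CREATE_TABLES": False,
    "JOB_ID": "job-42",
}))
print(prod.get_identifier_column())       # "job_id"
print(prod.get_identifier_value(spider))  # "job-42"
```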

scrapy_item_ingest/database/connection.py
@@ -0,0 +1,67 @@
+"""
+Database connection utilities for scrapy_item_ingest.
+"""
+import psycopg2
+import logging
+from urllib.parse import urlparse, unquote
+
+logger = logging.getLogger(__name__)
+
+
+class DatabaseConnection:
+    """Database connection manager"""
+
+    def __init__(self, db_url):
+        self.db_url = db_url
+        self.conn = None
+        self.cur = None
+
+    def connect(self):
+        """Establish database connection"""
+        try:
+            result = urlparse(self.db_url)
+            user = result.username
+            password = unquote(result.password) if result.password else None
+            host = result.hostname
+            port = result.port
+            dbname = result.path.lstrip('/')
+
+            self.conn = psycopg2.connect(
+                host=host, port=port, dbname=dbname,
+                user=user, password=password
+            )
+            self.cur = self.conn.cursor()
+            logger.info("Database connection established")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to connect to database: {e}")
+            return False
+
+    def close(self):
+        """Close database connection"""
+        if hasattr(self, 'cur') and self.cur:
+            self.cur.close()
+        if hasattr(self, 'conn') and self.conn:
+            self.conn.close()
+            logger.info("Database connection closed")
+
+    def execute(self, sql, params=None):
+        """Execute SQL query"""
+        try:
+            if params:
+                self.cur.execute(sql, params)
+            else:
+                self.cur.execute(sql)
+            return self.cur.fetchone() if self.cur.description else None
+        except Exception as e:
+            logger.error(f"Failed to execute query: {e}")
+            self.conn.rollback()
+            raise
+
+    def commit(self):
+        """Commit transaction"""
+        self.conn.commit()
+
+    def rollback(self):
+        """Rollback transaction"""
+        self.conn.rollback()
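
`DatabaseConnection` splits `DB_URL` with `urlparse`, URL-decodes the password, and keeps a single psycopg2 connection/cursor pair; `execute()` returns `fetchone()` only when the statement produces rows. A usage sketch against a hypothetical local database (the DSN is illustrative, not part of the package):

```python
# Sketch: standalone use of DatabaseConnection (needs a reachable PostgreSQL server).
from scrapy_item_ingest.database.connection import DatabaseConnection

db = DatabaseConnection("postgresql://scraper:secret%40pw@localhost:5432/scrapy_data")
if db.connect():                          # "%40" in the password is decoded by unquote()
    print(db.execute("SELECT 1"))         # SELECT has a cursor description -> (1,)
    db.execute("CREATE TABLE IF NOT EXISTS demo (id SERIAL PRIMARY KEY)")  # returns None
    db.commit()
    db.close()
```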

scrapy_item_ingest/database/schema.py
@@ -0,0 +1,79 @@
+"""
+Database schema management utilities for scrapy_item_ingest.
+"""
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class SchemaManager:
+    """Database schema management"""
+
+    def __init__(self, db_connection, settings):
+        self.db = db_connection
+        self.settings = settings
+
+    def create_items_table(self):
+        """Create items table if it doesn't exist"""
+        items_table_sql = f"""
+        CREATE TABLE IF NOT EXISTS {self.settings.db_items_table} (
+            id SERIAL PRIMARY KEY,
+            job_id VARCHAR(255),
+            item JSONB,
+            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+        )
+        """
+        self.db.execute(items_table_sql)
+        logger.info(f"Items table {self.settings.db_items_table} created/verified with job_id column")
+
+    def create_requests_table(self):
+        """Create requests table if it doesn't exist"""
+        requests_table_sql = f"""
+        CREATE TABLE IF NOT EXISTS {self.settings.db_requests_table} (
+            id SERIAL PRIMARY KEY,
+            job_id VARCHAR(255),
+            url TEXT,
+            method VARCHAR(10),
+            status_code INTEGER,
+            duration FLOAT,
+            response_time FLOAT,
+            fingerprint VARCHAR(64),
+            parent_id INTEGER,
+            parent_url TEXT,
+            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+            FOREIGN KEY (parent_id) REFERENCES {self.settings.db_requests_table}(id)
+        )
+        """
+        self.db.execute(requests_table_sql)
+        logger.info(f"Requests table {self.settings.db_requests_table} created/verified with job_id column")
+
+    def create_logs_table(self):
+        """Create logs table if it doesn't exist"""
+        logs_table_sql = f"""
+        CREATE TABLE IF NOT EXISTS {self.settings.db_logs_table} (
+            id SERIAL PRIMARY KEY,
+            job_id VARCHAR(255),
+            level VARCHAR(50),
+            message TEXT,
+            timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+        )
+        """
+        self.db.execute(logs_table_sql)
+        logger.info(f"Logs table {self.settings.db_logs_table} created/verified with job_id column")
+
+    def ensure_tables_exist(self):
+        """Create all tables if they don't exist (only if create_tables is True)"""
+        if not self.settings.create_tables:
+            logger.info("Table creation disabled. Skipping table creation.")
+            return
+
+        try:
+            self.create_items_table()
+            self.create_requests_table()
+            self.create_logs_table()
+            self.db.commit()
+            logger.info("All tables created/verified successfully")
+        except Exception as e:
+            logger.error(f"Failed to create tables: {e}")
+            self.db.rollback()
+            raise
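
`SchemaManager` needs only a connected `DatabaseConnection` and a `Settings` object, and it is a no-op when `CREATE_TABLES` is false; this mirrors what `BasePipeline.open_spider` does in `pipelines/base.py` below. A sketch under the same assumptions as above (illustrative DSN, reachable PostgreSQL):

```python
# Sketch: creating the job_items / job_requests / job_logs tables up front.
from scrapy.settings import Settings as ScrapySettings
from scrapy_item_ingest.config.settings import Settings
from scrapy_item_ingest.database.connection import DatabaseConnection
from scrapy_item_ingest.database.schema import SchemaManager

settings = Settings(ScrapySettings({
    "DB_URL": "postgresql://scraper:secret@localhost:5432/scrapy_data",  # illustrative
    "CREATE_TABLES": True,
}))
db = DatabaseConnection(settings.db_url)
if db.connect():
    SchemaManager(db, settings).ensure_tables_exist()  # three CREATE TABLE IF NOT EXISTS + commit
    db.close()
```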

scrapy_item_ingest/extensions/base.py
@@ -0,0 +1,79 @@
+"""
+Base extension functionality for scrapy_item_ingest.
+"""
+import logging
+from datetime import datetime
+from sqlalchemy import create_engine, text
+from ..config.settings import Settings, validate_settings
+
+logger = logging.getLogger(__name__)
+
+
+class BaseExtension:
+    """Base extension with common functionality"""
+
+    def __init__(self, settings):
+        self.settings = settings
+        validate_settings(settings)
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create extension instance from crawler"""
+        settings = Settings(crawler.settings)
+        return cls(settings)
+
+    def get_identifier_info(self, spider):
+        """Get identifier column and value for the spider"""
+        return self.settings.get_identifier_column(), self.settings.get_identifier_value(spider)
+
+    def _ensure_logs_table_exists(self, engine):
+        """Create logs table if it doesn't exist (only if create_tables is True)"""
+        if not self.settings.create_tables:
+            logger.info("Table creation disabled. Skipping logs table creation.")
+            return
+
+        try:
+            with engine.connect() as connection:
+                # Determine the identifier column name
+                identifier_column = self.settings.get_identifier_column()
+
+                # Create logs table with type, message, and timestamp
+                logs_table_sql = f"""
+                CREATE TABLE IF NOT EXISTS {self.settings.db_logs_table} (
+                    id SERIAL PRIMARY KEY,
+                    {identifier_column} VARCHAR(255),
+                    type VARCHAR(50),
+                    message TEXT,
+                    timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                )
+                """
+                connection.execute(text(logs_table_sql))
+                connection.commit()
+                logger.info(f"Logs table {self.settings.db_logs_table} created/verified with {identifier_column} column")
+        except Exception as e:
+            logger.error(f"Failed to create logs table: {e}")
+
+    def _log_to_database(self, spider, log_level, message):
+        """Helper method to log messages to database"""
+        try:
+            identifier_column, identifier_value = self.get_identifier_info(spider)
+
+            engine = create_engine(self.settings.db_url)
+            self._ensure_logs_table_exists(engine)
+
+            stmt = text(f"""
+                INSERT INTO {self.settings.db_logs_table}
+                ({identifier_column}, level, message, timestamp)
+                VALUES (:identifier, :type, :message, :timestamp)
+            """)
+            with engine.connect() as connection:
+                connection.execute(stmt, {
+                    "identifier": identifier_value,
+                    "type": log_level,
+                    "message": message,
+                    "timestamp": datetime.now()
+                })
+                connection.commit()
+            logger.info(f"Logged {log_level} for {identifier_column} {identifier_value}")
+        except Exception as e:
+            logger.error(f"Failed to log {log_level}: {e}")

scrapy_item_ingest/extensions/logging.py
@@ -0,0 +1,45 @@
+"""
+Logging extension for tracking spider events.
+"""
+import logging
+from scrapy import signals
+from .base import BaseExtension
+
+logger = logging.getLogger(__name__)
+
+
+class LoggingExtension(BaseExtension):
+    """Extension for logging spider events to database"""
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create extension instance from crawler"""
+        ext = super().from_crawler(crawler)
+        # Connect to spider signals
+        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
+        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
+        crawler.signals.connect(ext.spider_error, signal=signals.spider_error)
+        crawler.signals.connect(ext.item_dropped, signal=signals.item_dropped)
+        return ext
+
+    def spider_opened(self, spider):
+        """Called when spider is opened"""
+        identifier_column, identifier_value = self.get_identifier_info(spider)
+        message = f"{identifier_column.title()} {identifier_value} started"
+        self._log_to_database(spider, "SPIDER_OPENED", message)
+
+    def spider_closed(self, spider, reason):
+        """Called when spider is closed"""
+        identifier_column, identifier_value = self.get_identifier_info(spider)
+        message = f"{identifier_column.title()} {identifier_value} closed with reason: {reason}"
+        self._log_to_database(spider, "SPIDER_CLOSED", message)
+
+    def spider_error(self, failure, response, spider):
+        """Called when spider encounters an error"""
+        message = f"Spider error: {str(failure.value)} on {response.url if response else 'unknown URL'}"
+        self._log_to_database(spider, "SPIDER_ERROR", message)
+
+    def item_dropped(self, item, response, spider, exception):
+        """Called when an item is dropped"""
+        message = f"Item dropped: {str(exception)} from {response.url if response else 'unknown URL'}"
+        self._log_to_database(spider, "ITEM_DROPPED", message)
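
Neither the pipeline nor the extension registers itself, so both have to be enabled in the Scrapy project's settings; the dotted paths below work because `__init__.py` re-exports the classes. A hedged sketch of a project `settings.py` (the priority numbers 300 and 500 are arbitrary choices, not values mandated by the package, and the DSN is illustrative):

```python
# Sketch: enabling the pipeline and extension in a Scrapy project's settings.py.
ITEM_PIPELINES = {
    "scrapy_item_ingest.DbInsertPipeline": 300,   # items + request tracking
}
EXTENSIONS = {
    "scrapy_item_ingest.LoggingExtension": 500,   # spider open/close/error/item_dropped logs
}

DB_URL = "postgresql://scraper:secret@localhost:5432/scrapy_data"  # required by validate_settings
CREATE_TABLES = True     # auto-create job_items / job_requests / job_logs on spider open
# JOB_ID = "job-42"      # optional; falls back to the spider name when unset
```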

scrapy_item_ingest/pipelines/base.py
@@ -0,0 +1,50 @@
+"""
+Base pipeline functionality for scrapy_item_ingest.
+"""
+import logging
+from ..config.settings import Settings, validate_settings
+from ..database.connection import DatabaseConnection
+from ..database.schema import SchemaManager
+
+logger = logging.getLogger(__name__)
+
+
+class BasePipeline:
+    """Base pipeline with common functionality"""
+
+    def __init__(self, settings):
+        self.settings = settings
+        self.db = None
+        self.schema_manager = None
+        validate_settings(settings)
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create pipeline instance from crawler"""
+        settings = Settings(crawler.settings)
+        return cls(settings)
+
+    def open_spider(self, spider):
+        """Called when spider is opened"""
+        # Establish database connection
+        self.db = DatabaseConnection(self.settings.db_url)
+        if not self.db.connect():
+            raise Exception("Failed to connect to database")
+
+        # Initialize schema manager
+        self.schema_manager = SchemaManager(self.db, self.settings)
+
+        # Ensure tables exist
+        self.schema_manager.ensure_tables_exist()
+
+        logger.info(f"Pipeline opened for {self.settings.get_identifier_column()}: {self.settings.get_identifier_value(spider)}")
+
+    def close_spider(self, spider):
+        """Called when spider is closed"""
+        if self.db:
+            self.db.close()
+        logger.info(f"Pipeline closed for {self.settings.get_identifier_column()}: {self.settings.get_identifier_value(spider)}")
+
+    def get_identifier_info(self, spider):
+        """Get identifier column and value for the spider"""
+        return self.settings.get_identifier_column(), self.settings.get_identifier_value(spider)

scrapy_item_ingest/pipelines/items.py
@@ -0,0 +1,42 @@
+"""
+Items pipeline for storing scraped items.
+"""
+import logging
+from datetime import datetime, timezone
+from itemadapter import ItemAdapter
+from scrapy.exceptions import DropItem
+from .base import BasePipeline
+from ..utils.serialization import serialize_item_data
+
+logger = logging.getLogger(__name__)
+
+
+class ItemsPipeline(BasePipeline):
+    """Pipeline for handling scraped items"""
+
+    def process_item(self, item, spider):
+        """Process and store item in database"""
+        job_id = self.settings.get_identifier_value(spider)
+
+        logger.info(f"Processing item for job_id {job_id}: {item}")
+        adapter = ItemAdapter(item)
+        item_dict = adapter.asdict()
+        created_at = datetime.now(timezone.utc)
+
+        logger.info(f"Item dict prepared: {item_dict}")
+
+        # Store everything as JSON in the item column
+        try:
+            sql = f"INSERT INTO {self.settings.db_items_table} (job_id, item, created_at) VALUES (%s, %s, %s)"
+            json_data = serialize_item_data(item_dict)
+            logger.info(f"Executing SQL: {sql} with JSON data")
+
+            self.db.execute(sql, (job_id, json_data, created_at))
+            self.db.commit()
+            logger.info(f"Successfully inserted item for job_id {job_id}")
+        except Exception as e:
+            logger.error(f"Failed to insert item: {e}")
+            self.db.rollback()
+            raise DropItem(f"DB insert error: {e}")
+
+        return item

scrapy_item_ingest/pipelines/main.py
@@ -0,0 +1,41 @@
+"""
+Main pipeline that combines items and requests functionality.
+"""
+import logging
+from .items import ItemsPipeline
+from .requests import RequestsPipeline
+
+logger = logging.getLogger(__name__)
+
+
+class DbInsertPipeline(ItemsPipeline, RequestsPipeline):
+    """
+    Main pipeline that combines item processing and request tracking.
+    Inherits from both ItemsPipeline and RequestsPipeline.
+    """
+
+    def __init__(self, settings):
+        # Initialize both parent classes
+        ItemsPipeline.__init__(self, settings)
+        RequestsPipeline.__init__(self, settings)
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create pipeline instance from crawler"""
+        # Use RequestsPipeline's from_crawler to get signal connections
+        return RequestsPipeline.from_crawler.__func__(cls, crawler)
+
+    def open_spider(self, spider):
+        """Called when spider is opened"""
+        # Use the base class implementation
+        super().open_spider(spider)
+
+    def close_spider(self, spider):
+        """Called when spider is closed"""
+        # Use the base class implementation
+        super().close_spider(spider)
+
+    def process_item(self, item, spider):
+        """Process and store item in database"""
+        # Use ItemsPipeline's process_item method
+        return ItemsPipeline.process_item(self, item, spider)
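
Because `DbInsertPipeline` multiply inherits from `ItemsPipeline` and `RequestsPipeline`, both of which derive from `BasePipeline`, Python's method resolution order decides what the `super()` calls resolve to. A quick way to inspect it, assuming the package is installed:

```python
# Sketch: inspecting the method resolution order of the combined pipeline.
from scrapy_item_ingest.pipelines.main import DbInsertPipeline

for cls in DbInsertPipeline.__mro__:
    print(cls.__qualname__)
# DbInsertPipeline, ItemsPipeline, RequestsPipeline, BasePipeline, object
```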

scrapy_item_ingest/pipelines/requests.py
@@ -0,0 +1,169 @@
+"""
+Requests pipeline for tracking request information.
+"""
+import logging
+import time
+from datetime import datetime, timezone
+from scrapy import signals
+from .base import BasePipeline
+from ..utils.fingerprint import get_request_fingerprint
+
+logger = logging.getLogger(__name__)
+
+
+class RequestsPipeline(BasePipeline):
+    """Pipeline for handling request tracking"""
+
+    def __init__(self, settings):
+        super().__init__(settings)
+        self.request_start_times = {}  # Track request start times
+        self.request_id_map = {}  # Track fingerprint to database ID mapping
+        self.url_to_id_map = {}  # Track URL to database ID mapping
+        self.current_response_url = None  # Track current response being processed
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create pipeline instance from crawler"""
+        pipeline = super().from_crawler(crawler)
+        # Connect to request signals to automatically log requests
+        crawler.signals.connect(pipeline.request_scheduled, signal=signals.request_scheduled)
+        crawler.signals.connect(pipeline.response_received, signal=signals.response_received)
+        return pipeline
+
+    def _get_parent_request_info(self, request, spider):
+        """Extract parent request information if available"""
+        parent_id = None
+        parent_url = None
+
+        # Get job_id for the current spider
+        job_id = self.settings.get_identifier_value(spider)
+
+        try:
+            # Method 1: Use current response URL as parent (most reliable)
+            if self.current_response_url and self.current_response_url != request.url:
+                parent_url = self.current_response_url
+                if parent_url in self.url_to_id_map:
+                    parent_id = self.url_to_id_map[parent_url]
+                    logger.info(f"Found parent ID {parent_id} from current response URL: {parent_url}")
+
+            # Method 2: Check request meta for referer
+            if not parent_id and hasattr(request, 'meta') and request.meta:
+                if 'referer' in request.meta:
+                    parent_url = request.meta['referer']
+                    logger.info(f"Found referer in meta: {parent_url}")
+
+                    # Look up in our URL mapping first (faster)
+                    if parent_url in self.url_to_id_map:
+                        parent_id = self.url_to_id_map[parent_url]
+                        logger.info(f"Found parent ID {parent_id} from URL mapping")
+                    else:
+                        # Look up in database
+                        try:
+                            sql = f"SELECT id FROM {self.settings.db_requests_table} WHERE url = %s AND job_id = %s ORDER BY created_at DESC LIMIT 1"
+                            result = self.db.execute(sql, (parent_url, job_id))
+                            if result:
+                                parent_id = result[0]
+                                # Cache the result
+                                self.url_to_id_map[parent_url] = parent_id
+                                logger.info(f"Found parent ID {parent_id} from database lookup")
+                        except Exception as e:
+                            logger.warning(f"Could not look up parent ID by referer URL: {e}")
+
+            # Debug: Log request meta information
+            logger.debug(f"Request URL: {request.url}")
+            logger.debug(f"Request meta keys: {list(request.meta.keys()) if request.meta else 'None'}")
+            if 'depth' in request.meta:
+                logger.debug(f"Request depth: {request.meta['depth']}")
+
+        except Exception as e:
+            logger.warning(f"Could not extract parent request info: {e}")
+
+        # If we still don't have parent info, log for debugging
+        if not parent_id and not parent_url:
+            logger.debug(f"No parent found for request: {request.url}")
+
+        return parent_id, parent_url
+
+    def log_request(self, request, spider):
+        """Log request to database"""
+        job_id = self.settings.get_identifier_value(spider)
+
+        logger.info(f"Logging request for job_id {job_id}: {request.url}")
+        fingerprint = get_request_fingerprint(request)
+        parent_id, parent_url = self._get_parent_request_info(request, spider)
+        request_time = time.time()
+        created_at = datetime.now(timezone.utc)
+
+        # Store request start time for duration calculation
+        self.request_start_times[fingerprint] = request_time
+
+        sql = f"""
+            INSERT INTO {self.settings.db_requests_table}
+            (job_id, url, method, fingerprint, parent_id, parent_url, created_at)
+            VALUES (%s, %s, %s, %s, %s, %s, %s) RETURNING id
+        """
+        try:
+            result = self.db.execute(sql, (
+                job_id,
+                request.url,
+                request.method,
+                fingerprint,
+                parent_id,
+                parent_url,
+                created_at
+            ))
+
+            # Get the inserted record ID and store it for future parent lookups
+            if result:
+                record_id = result[0]
+                self.request_id_map[fingerprint] = record_id
+                self.url_to_id_map[request.url] = record_id  # Store URL to ID mapping
+
+            self.db.commit()
+
+            log_msg = f"Successfully logged request for job_id {job_id} with fingerprint {fingerprint} (ID: {record_id})"
+            if parent_id:
+                log_msg += f" (parent ID: {parent_id}, parent URL: {parent_url})"
+            else:
+                log_msg += " (no parent found)"
+            logger.info(log_msg)
+        except Exception as e:
+            logger.error(f"Failed to log request: {e}")
+            self.db.rollback()
+
+    def request_scheduled(self, request, spider):
+        """Called when a request is scheduled"""
+        job_id = self.settings.get_identifier_value(spider)
+        logger.info(f"Request scheduled for job_id {job_id}: {request.url}")
+        self.log_request(request, spider)
+
+    def response_received(self, response, request, spider):
+        """Called when a response is received"""
+        job_id = self.settings.get_identifier_value(spider)
+
+        logger.info(f"Response received for job_id {job_id}: {response.url} (status: {response.status})")
+
+        # Set current response URL for parent tracking
+        self.current_response_url = response.url
+
+        fingerprint = get_request_fingerprint(request)
+        response_time = time.time()
+
+        # Update the request log with response info
+        try:
+            sql = f"""
+                UPDATE {self.settings.db_requests_table}
+                SET status_code = %s, response_time = %s
+                WHERE job_id = %s AND fingerprint = %s AND status_code IS NULL
+            """
+            self.db.execute(sql, (
+                response.status,
+                response_time,
+                job_id,
+                fingerprint
+            ))
+            self.db.commit()
+            logger.info(f"Updated request status {response.status} and response_time for fingerprint {fingerprint}")
+        except Exception as e:
+            logger.error(f"Failed to update request status: {e}")
+            self.db.rollback()
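
Parent linking relies on `request.meta['referer']` or on the URL of the response currently being processed; Scrapy itself only sets a `Referer` header, so a spider that wants reliable parent rows can pass the referring URL through `meta` explicitly. A hedged sketch of such a spider callback (the spider name and start URL are illustrative, not part of the package):

```python
# Sketch: passing the referring URL through request.meta so RequestsPipeline can link parent rows.
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        for href in response.css("a::attr(href)").getall():
            yield response.follow(
                href,
                callback=self.parse,
                meta={"referer": response.url},  # read by _get_parent_request_info()
            )
```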

scrapy_item_ingest/utils/fingerprint.py
@@ -0,0 +1,25 @@
+"""
+Request fingerprint utilities for generating unique request identifiers.
+"""
+import hashlib
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def get_request_fingerprint(request):
+    """Generate fingerprint for the request"""
+    try:
+        from scrapy.utils.request import request_fingerprint
+        return request_fingerprint(request)
+    except Exception as e:
+        logger.warning(f"Could not generate fingerprint: {e}")
+        # Fallback fingerprint generation
+        fingerprint_data = f"{request.method}:{request.url}"
+        return hashlib.sha1(fingerprint_data.encode()).hexdigest()
+
+
+def generate_url_fingerprint(method, url):
+    """Generate a simple fingerprint for URL and method combination"""
+    fingerprint_data = f"{method}:{url}"
+    return hashlib.sha1(fingerprint_data.encode()).hexdigest()
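
On Scrapy versions where `scrapy.utils.request.request_fingerprint` is deprecated or unavailable, the fallback hashes `"METHOD:URL"` with SHA-1, which is exactly what `generate_url_fingerprint` computes. A quick check (the example URL is illustrative):

```python
# Sketch: the SHA-1 fallback fingerprint for a method/URL pair.
import hashlib

from scrapy_item_ingest.utils.fingerprint import generate_url_fingerprint

fp = generate_url_fingerprint("GET", "https://example.com/page")
assert fp == hashlib.sha1(b"GET:https://example.com/page").hexdigest()
print(fp)  # 40-character hex digest, fits the VARCHAR(64) fingerprint column
```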

scrapy_item_ingest/utils/serialization.py
@@ -0,0 +1,28 @@
+"""
+Serialization utilities for converting data to JSON-serializable format.
+"""
+import json
+from datetime import datetime, date, time
+from decimal import Decimal
+
+
+def serialize_stats(obj):
+    """Recursively convert stats to JSON-serializable format"""
+    if isinstance(obj, dict):
+        return {key: serialize_stats(value) for key, value in obj.items()}
+    elif isinstance(obj, (list, tuple)):
+        return [serialize_stats(item) for item in obj]
+    elif isinstance(obj, (datetime, date, time)):
+        return obj.isoformat()
+    elif isinstance(obj, Decimal):
+        return float(obj)
+    elif isinstance(obj, (int, float, str, bool)) or obj is None:
+        return obj
+    else:
+        # For any other type, convert to string
+        return str(obj)
+
+
+def serialize_item_data(item_dict):
+    """Serialize item data to JSON string"""
+    return json.dumps(item_dict, ensure_ascii=False, default=str)
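
`serialize_stats` walks nested containers, converting datetimes to ISO strings and `Decimal` to `float`, while `serialize_item_data` leans on `json.dumps(..., default=str)` for anything else that is not natively JSON-serializable. A small illustration with made-up values:

```python
# Sketch: converting scraped data into JSON-friendly values before insertion.
from datetime import datetime, timezone
from decimal import Decimal

from scrapy_item_ingest.utils.serialization import serialize_item_data, serialize_stats

stats = {
    "finish_time": datetime(2025, 1, 1, 12, 0, tzinfo=timezone.utc),
    "prices": [Decimal("9.99"), Decimal("12.50")],
}
print(serialize_stats(stats))
# {'finish_time': '2025-01-01T12:00:00+00:00', 'prices': [9.99, 12.5]}

print(serialize_item_data({"title": "Ünïcode stays", "scraped_at": datetime(2025, 1, 1)}))
# {"title": "Ünïcode stays", "scraped_at": "2025-01-01 00:00:00"}
```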

scrapy_item_ingest-0.1.0.dist-info/METADATA
@@ -0,0 +1,132 @@
+Metadata-Version: 2.4
+Name: scrapy_item_ingest
+Version: 0.1.0
+Summary: Scrapy extension for database ingestion with job/spider tracking
+Home-page: https://github.com/fawadss1/scrapy_item_ingest
+Author: Fawad Ali
+Author-email: fawadstar6@gmail.com
+Project-URL: Documentation, https://scrapy-item-ingest.readthedocs.io/
+Project-URL: Source, https://github.com/fawadss1/scrapy_item_ingest
+Project-URL: Tracker, https://github.com/fawadss1/scrapy_item_ingest/issues
+Keywords: scrapy,database,postgresql,web-scraping,data-pipeline
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Framework :: Scrapy
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Topic :: Database
+Requires-Python: >=3.7
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: scrapy>=2.5.0
+Requires-Dist: psycopg2-binary>=2.8.0
+Requires-Dist: itemadapter>=0.6.0
+Requires-Dist: SQLAlchemy>=1.4.0
+Provides-Extra: docs
+Requires-Dist: sphinx>=5.0.0; extra == "docs"
+Requires-Dist: sphinx_rtd_theme>=1.2.0; extra == "docs"
+Requires-Dist: myst-parser>=0.18.0; extra == "docs"
+Requires-Dist: sphinx-autodoc-typehints>=1.19.0; extra == "docs"
+Requires-Dist: sphinx-copybutton>=0.5.0; extra == "docs"
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+Requires-Dist: black>=22.0.0; extra == "dev"
+Requires-Dist: flake8>=5.0.0; extra == "dev"
+Requires-Dist: mypy>=0.991; extra == "dev"
+Requires-Dist: pre-commit>=2.20.0; extra == "dev"
+Provides-Extra: test
+Requires-Dist: pytest>=7.0.0; extra == "test"
+Requires-Dist: pytest-cov>=4.0.0; extra == "test"
+Requires-Dist: pytest-mock>=3.8.0; extra == "test"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license-file
+Dynamic: project-url
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+# Scrapy Item Ingest
+
+[](https://pypi.org/project/scrapy-item-ingest/)
+[](https://pypi.org/project/scrapy-item-ingest/)
+[](https://pypi.org/project/scrapy-item-ingest/)
+[](https://opensource.org/licenses/MIT)
+
+[](https://github.com/fawadss1/scrapy_item_ingest/stargazers)
+[](https://github.com/fawadss1/scrapy_item_ingest/issues)
+[](https://github.com/fawadss1/scrapy_item_ingest/commits)
+
+A comprehensive Scrapy extension for ingesting scraped items, requests, and logs into PostgreSQL databases with advanced tracking capabilities. This library provides a clean, production-ready solution for storing and monitoring your Scrapy crawling operations with real-time data ingestion and comprehensive logging.
+
+## Documentation
+
+Full documentation is available at: [https://scrapy-item-ingest.readthedocs.io/en/latest/](https://scrapy-item-ingest.readthedocs.io/en/latest/)
+
+## Key Features
+
+- 🔄 **Real-time Data Ingestion**: Store items, requests, and logs as they're processed
+- 📊 **Request Tracking**: Track request response times, fingerprints, and parent-child relationships
+- 🔍 **Comprehensive Logging**: Capture spider events, errors, and custom messages
+- 🏗️ **Flexible Schema**: Support for both auto-creation and existing table modes
+- ⚙️ **Modular Design**: Use individual components or the complete pipeline
+- 🛡️ **Production Ready**: Handles both development and production scenarios
+- 📝 **JSONB Storage**: Store complex item data as JSONB for flexible querying
+- 🐳 **Docker Support**: Complete containerization with Docker and Kubernetes
+- 📈 **Performance Optimized**: Connection pooling and batch processing
+- 🔧 **Easy Configuration**: Environment-based configuration with validation
+- 📊 **Monitoring Ready**: Built-in metrics and health checks
+
+## Installation
+
+```bash
+pip install scrapy-item-ingest
+```
+
+## Development
+
+### Setting up for Development
+
+```bash
+git clone https://github.com/fawadss1/scrapy_item_ingest.git
+cd scrapy_item_ingest
+pip install -e ".[dev]"
+```
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## Support
+
+For support and questions:
+
+- **Email**: fawadstar6@gmail.com
+- **Documentation**: [https://scrapy-item-ingest.readthedocs.io/](https://scrapy-item-ingest.readthedocs.io/)
+- **Issues**: Please report bugs and feature requests at [GitHub Issues](https://github.com/fawadss1/scrapy_item_ingest/issues)
+
+## Changelog
+
+### v0.1.0 (Current)
+
+- Initial release
+- Core pipeline functionality for items, requests, and logs
+- PostgreSQL database integration with JSONB storage
+- Comprehensive documentation and examples
+- Production deployment guides
+- Docker and Kubernetes support

scrapy_item_ingest-0.1.0.dist-info/RECORD
@@ -0,0 +1,23 @@
+scrapy_item_ingest/__init__.py,sha256=2Cj2dnPpsGOTGH95ihMAdwASfT-Nhw3i4M5zfxLfgaw,1507
+scrapy_item_ingest/config/__init__.py,sha256=Foyt52_KDRIoDZtSH5ttcWxQXCOUgzebo4IGCPQwriY,55
+scrapy_item_ingest/config/settings.py,sha256=5GFQAqRf-6oc4KaMdMGuzLdtSP6fmn67anzGnKpejTI,2619
+scrapy_item_ingest/database/__init__.py,sha256=-D9cfI8Hrap74UkIUmcOZ-ikAZ8HKSswZAZMBtjq69A,50
+scrapy_item_ingest/database/connection.py,sha256=bvSTCQfgBMcuKu-VzMCwMtSNBORzeeaYKm63eRtwKz8,2027
+scrapy_item_ingest/database/schema.py,sha256=2HcBbW3VIWva59YCxyAinwZQDidFuyU5zuOCdCwBZUI,2866
+scrapy_item_ingest/extensions/__init__.py,sha256=G8xe0Bssf4jFvi3D1gNyOpylaDHlo-RKHEX9_tIB2f8,51
+scrapy_item_ingest/extensions/base.py,sha256=aodB_V47O8ihox2VdDizclAhQ6VonAUZ4lIOitub7kw,3192
+scrapy_item_ingest/extensions/logging.py,sha256=pI-MlQfNLHFmNuOSp60SfFAGRxJwPwurZp1xt88PeWU,2017
+scrapy_item_ingest/pipelines/__init__.py,sha256=NvbUeLCwjFPvVaSTzdnN6LkToJ1ISAM91EmVero9FXo,50
+scrapy_item_ingest/pipelines/base.py,sha256=C6lk37lhNr6oADXofi4aqnN0cZ9u1KrWxEZMp6Oo7oA,1783
+scrapy_item_ingest/pipelines/items.py,sha256=FcgzmuFJ3qPVVQDZv9OjEPjlSbbW5MZykMmhAcdZ4Tc,1487
+scrapy_item_ingest/pipelines/main.py,sha256=udABcEJPsJRdOFWZckP03UUUIJrHUHlfOttookdClMI,1383
+scrapy_item_ingest/pipelines/requests.py,sha256=3Wyzx6kgf7B_gg2DC0jhekCPSIEaJiRJquhdWyLrHBs,7427
+scrapy_item_ingest/utils/__init__.py,sha256=xuzfL-u3NkFElIrBygQISYv0CKMdSVvreuL16JXZMRM,49
+scrapy_item_ingest/utils/fingerprint.py,sha256=Qdby72nLNQp4-sxL51RM85MuxwFJHxmuYDbQv1c7hPc,855
+scrapy_item_ingest/utils/serialization.py,sha256=iKGhWnVwMKLKZ63kek4Hov9ESy9igA13CuOfDRD1W-M,942
+scrapy_item_ingest-0.1.0.dist-info/licenses/LICENSE,sha256=DhJQ4_j45c_DWghISLKmJshcLvX_Pr7QXaahe2iRMNo,1087
+scrapy_item_ingest-0.1.0.dist-info/METADATA,sha256=eiZF00MlFqNBdExA4kW2i4C_p_TvLsd4sLAsFE4uiD4,5868
+scrapy_item_ingest-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+scrapy_item_ingest-0.1.0.dist-info/entry_points.txt,sha256=WKFpo9Dy0qX1S1PT8NvIHqZmSxBCgyAM480LnLR8S1E,172
+scrapy_item_ingest-0.1.0.dist-info/top_level.txt,sha256=bu2ekFWcSH0ANdc8oGDdmZXaSC6kNuhtC-AggLsUQCU,19
+scrapy_item_ingest-0.1.0.dist-info/RECORD,,

scrapy_item_ingest-0.1.0.dist-info/licenses/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Fawad Ali
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

scrapy_item_ingest-0.1.0.dist-info/top_level.txt
@@ -0,0 +1 @@
+scrapy_item_ingest