scrapy-item-ingest 0.1.0 (tar.gz)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of scrapy-item-ingest has been flagged as potentially problematic.

Files changed (29)
  1. scrapy_item_ingest-0.1.0/LICENSE +21 -0
  2. scrapy_item_ingest-0.1.0/PKG-INFO +132 -0
  3. scrapy_item_ingest-0.1.0/README.md +69 -0
  4. scrapy_item_ingest-0.1.0/scrapy_item_ingest/__init__.py +48 -0
  5. scrapy_item_ingest-0.1.0/scrapy_item_ingest/config/__init__.py +2 -0
  6. scrapy_item_ingest-0.1.0/scrapy_item_ingest/config/settings.py +82 -0
  7. scrapy_item_ingest-0.1.0/scrapy_item_ingest/database/__init__.py +2 -0
  8. scrapy_item_ingest-0.1.0/scrapy_item_ingest/database/connection.py +67 -0
  9. scrapy_item_ingest-0.1.0/scrapy_item_ingest/database/schema.py +79 -0
  10. scrapy_item_ingest-0.1.0/scrapy_item_ingest/extensions/__init__.py +2 -0
  11. scrapy_item_ingest-0.1.0/scrapy_item_ingest/extensions/base.py +79 -0
  12. scrapy_item_ingest-0.1.0/scrapy_item_ingest/extensions/logging.py +45 -0
  13. scrapy_item_ingest-0.1.0/scrapy_item_ingest/pipelines/__init__.py +2 -0
  14. scrapy_item_ingest-0.1.0/scrapy_item_ingest/pipelines/base.py +50 -0
  15. scrapy_item_ingest-0.1.0/scrapy_item_ingest/pipelines/items.py +42 -0
  16. scrapy_item_ingest-0.1.0/scrapy_item_ingest/pipelines/main.py +41 -0
  17. scrapy_item_ingest-0.1.0/scrapy_item_ingest/pipelines/requests.py +169 -0
  18. scrapy_item_ingest-0.1.0/scrapy_item_ingest/utils/__init__.py +2 -0
  19. scrapy_item_ingest-0.1.0/scrapy_item_ingest/utils/fingerprint.py +25 -0
  20. scrapy_item_ingest-0.1.0/scrapy_item_ingest/utils/serialization.py +28 -0
  21. scrapy_item_ingest-0.1.0/scrapy_item_ingest.egg-info/PKG-INFO +132 -0
  22. scrapy_item_ingest-0.1.0/scrapy_item_ingest.egg-info/SOURCES.txt +27 -0
  23. scrapy_item_ingest-0.1.0/scrapy_item_ingest.egg-info/dependency_links.txt +1 -0
  24. scrapy_item_ingest-0.1.0/scrapy_item_ingest.egg-info/entry_points.txt +5 -0
  25. scrapy_item_ingest-0.1.0/scrapy_item_ingest.egg-info/not-zip-safe +1 -0
  26. scrapy_item_ingest-0.1.0/scrapy_item_ingest.egg-info/requires.txt +24 -0
  27. scrapy_item_ingest-0.1.0/scrapy_item_ingest.egg-info/top_level.txt +1 -0
  28. scrapy_item_ingest-0.1.0/setup.cfg +4 -0
  29. scrapy_item_ingest-0.1.0/setup.py +81 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Fawad Ali
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,132 @@
1
+ Metadata-Version: 2.4
2
+ Name: scrapy_item_ingest
3
+ Version: 0.1.0
4
+ Summary: Scrapy extension for database ingestion with job/spider tracking
5
+ Home-page: https://github.com/fawadss1/scrapy_item_ingest
6
+ Author: Fawad Ali
7
+ Author-email: fawadstar6@gmail.com
8
+ Project-URL: Documentation, https://scrapy-item-ingest.readthedocs.io/
9
+ Project-URL: Source, https://github.com/fawadss1/scrapy_item_ingest
10
+ Project-URL: Tracker, https://github.com/fawadss1/scrapy_item_ingest/issues
11
+ Keywords: scrapy,database,postgresql,web-scraping,data-pipeline
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.7
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Framework :: Scrapy
23
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
+ Classifier: Topic :: Internet :: WWW/HTTP
25
+ Classifier: Topic :: Database
26
+ Requires-Python: >=3.7
27
+ Description-Content-Type: text/markdown
28
+ License-File: LICENSE
29
+ Requires-Dist: scrapy>=2.5.0
30
+ Requires-Dist: psycopg2-binary>=2.8.0
31
+ Requires-Dist: itemadapter>=0.6.0
32
+ Requires-Dist: SQLAlchemy>=1.4.0
33
+ Provides-Extra: docs
34
+ Requires-Dist: sphinx>=5.0.0; extra == "docs"
35
+ Requires-Dist: sphinx_rtd_theme>=1.2.0; extra == "docs"
36
+ Requires-Dist: myst-parser>=0.18.0; extra == "docs"
37
+ Requires-Dist: sphinx-autodoc-typehints>=1.19.0; extra == "docs"
38
+ Requires-Dist: sphinx-copybutton>=0.5.0; extra == "docs"
39
+ Provides-Extra: dev
40
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
41
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
42
+ Requires-Dist: black>=22.0.0; extra == "dev"
43
+ Requires-Dist: flake8>=5.0.0; extra == "dev"
44
+ Requires-Dist: mypy>=0.991; extra == "dev"
45
+ Requires-Dist: pre-commit>=2.20.0; extra == "dev"
46
+ Provides-Extra: test
47
+ Requires-Dist: pytest>=7.0.0; extra == "test"
48
+ Requires-Dist: pytest-cov>=4.0.0; extra == "test"
49
+ Requires-Dist: pytest-mock>=3.8.0; extra == "test"
50
+ Dynamic: author
51
+ Dynamic: author-email
52
+ Dynamic: classifier
53
+ Dynamic: description
54
+ Dynamic: description-content-type
55
+ Dynamic: home-page
56
+ Dynamic: keywords
57
+ Dynamic: license-file
58
+ Dynamic: project-url
59
+ Dynamic: provides-extra
60
+ Dynamic: requires-dist
61
+ Dynamic: requires-python
62
+ Dynamic: summary
63
+
64
+ # Scrapy Item Ingest
65
+
66
+ [![PyPI Version](https://img.shields.io/pypi/v/scrapy-item-ingest.svg)](https://pypi.org/project/scrapy-item-ingest/)
67
+ [![PyPI Downloads](https://img.shields.io/pypi/dm/scrapy-item-ingest.svg)](https://pypi.org/project/scrapy-item-ingest/)
68
+ [![Supported Python Versions](https://img.shields.io/pypi/pyversions/scrapy-item-ingest.svg)](https://pypi.org/project/scrapy-item-ingest/)
69
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
70
+
71
+ [![GitHub Stars](https://img.shields.io/github/stars/fawadss1/scrapy_item_ingest.svg)](https://github.com/fawadss1/scrapy_item_ingest/stargazers)
72
+ [![GitHub Issues](https://img.shields.io/github/issues/fawadss1/scrapy_item_ingest.svg)](https://github.com/fawadss1/scrapy_item_ingest/issues)
73
+ [![GitHub Last Commit](https://img.shields.io/github/last-commit/fawadss1/scrapy_item_ingest.svg)](https://github.com/fawadss1/scrapy_item_ingest/commits)
74
+
75
+ A comprehensive Scrapy extension for ingesting scraped items, requests, and logs into PostgreSQL databases with advanced tracking capabilities. This library provides a clean, production-ready solution for storing and monitoring your Scrapy crawling operations with real-time data ingestion and comprehensive logging.
76
+
77
+ ## Documentation
78
+
79
+ Full documentation is available at: [https://scrapy-item-ingest.readthedocs.io/en/latest/](https://scrapy-item-ingest.readthedocs.io/en/latest/)
80
+
81
+ ## Key Features
82
+
83
+ - 🔄 **Real-time Data Ingestion**: Store items, requests, and logs as they're processed
84
+ - 📊 **Request Tracking**: Track request response times, fingerprints, and parent-child relationships
85
+ - 🔍 **Comprehensive Logging**: Capture spider events, errors, and custom messages
86
+ - 🏗️ **Flexible Schema**: Support for both auto-creation and existing table modes
87
+ - ⚙️ **Modular Design**: Use individual components or the complete pipeline
88
+ - 🛡️ **Production Ready**: Handles both development and production scenarios
89
+ - 📝 **JSONB Storage**: Store complex item data as JSONB for flexible querying
90
+ - 🐳 **Docker Support**: Complete containerization with Docker and Kubernetes
91
+ - 📈 **Performance Optimized**: Connection pooling and batch processing
92
+ - 🔧 **Easy Configuration**: Environment-based configuration with validation
93
+ - 📊 **Monitoring Ready**: Built-in metrics and health checks
94
+
95
+ ## Installation
96
+
97
+ ```bash
98
+ pip install scrapy-item-ingest
99
+ ```
100
+
101
+ ## Development
102
+
103
+ ### Setting up for Development
104
+
105
+ ```bash
106
+ git clone https://github.com/fawadss1/scrapy_item_ingest.git
107
+ cd scrapy_item_ingest
108
+ pip install -e ".[dev]"
109
+ ```
110
+
111
+ ## License
112
+
113
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
114
+
115
+ ## Support
116
+
117
+ For support and questions:
118
+
119
+ - **Email**: fawadstar6@gmail.com
120
+ - **Documentation**: [https://scrapy-item-ingest.readthedocs.io/](https://scrapy-item-ingest.readthedocs.io/)
121
+ - **Issues**: Please report bugs and feature requests at [GitHub Issues](https://github.com/fawadss1/scrapy_item_ingest/issues)
122
+
123
+ ## Changelog
124
+
125
+ ### v0.1.0 (Current)
126
+
127
+ - Initial release
128
+ - Core pipeline functionality for items, requests, and logs
129
+ - PostgreSQL database integration with JSONB storage
130
+ - Comprehensive documentation and examples
131
+ - Production deployment guides
132
+ - Docker and Kubernetes support
@@ -0,0 +1,69 @@
1
+ # Scrapy Item Ingest
2
+
3
+ [![PyPI Version](https://img.shields.io/pypi/v/scrapy-item-ingest.svg)](https://pypi.org/project/scrapy-item-ingest/)
4
+ [![PyPI Downloads](https://img.shields.io/pypi/dm/scrapy-item-ingest.svg)](https://pypi.org/project/scrapy-item-ingest/)
5
+ [![Supported Python Versions](https://img.shields.io/pypi/pyversions/scrapy-item-ingest.svg)](https://pypi.org/project/scrapy-item-ingest/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
+
8
+ [![GitHub Stars](https://img.shields.io/github/stars/fawadss1/scrapy_item_ingest.svg)](https://github.com/fawadss1/scrapy_item_ingest/stargazers)
9
+ [![GitHub Issues](https://img.shields.io/github/issues/fawadss1/scrapy_item_ingest.svg)](https://github.com/fawadss1/scrapy_item_ingest/issues)
10
+ [![GitHub Last Commit](https://img.shields.io/github/last-commit/fawadss1/scrapy_item_ingest.svg)](https://github.com/fawadss1/scrapy_item_ingest/commits)
11
+
12
+ A comprehensive Scrapy extension for ingesting scraped items, requests, and logs into PostgreSQL databases with advanced tracking capabilities. This library provides a clean, production-ready solution for storing and monitoring your Scrapy crawling operations with real-time data ingestion and comprehensive logging.
13
+
14
+ ## Documentation
15
+
16
+ Full documentation is available at: [https://scrapy-item-ingest.readthedocs.io/en/latest/](https://scrapy-item-ingest.readthedocs.io/en/latest/)
17
+
18
+ ## Key Features
19
+
20
+ - 🔄 **Real-time Data Ingestion**: Store items, requests, and logs as they're processed
21
+ - 📊 **Request Tracking**: Track request response times, fingerprints, and parent-child relationships
22
+ - 🔍 **Comprehensive Logging**: Capture spider events, errors, and custom messages
23
+ - 🏗️ **Flexible Schema**: Support for both auto-creation and existing table modes
24
+ - ⚙️ **Modular Design**: Use individual components or the complete pipeline
25
+ - 🛡️ **Production Ready**: Handles both development and production scenarios
26
+ - 📝 **JSONB Storage**: Store complex item data as JSONB for flexible querying
27
+ - 🐳 **Docker Support**: Complete containerization with Docker and Kubernetes
28
+ - 📈 **Performance Optimized**: Connection pooling and batch processing
29
+ - 🔧 **Easy Configuration**: Environment-based configuration with validation
30
+ - 📊 **Monitoring Ready**: Built-in metrics and health checks
31
+
32
+ ## Installation
33
+
34
+ ```bash
35
+ pip install scrapy-item-ingest
36
+ ```
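
The README stops at installation, so here is a minimal wiring sketch (not part of the package): the class paths match the package's entry points, the setting names `DB_URL`, `CREATE_TABLES`, and `JOB_ID` are the ones read by `scrapy_item_ingest/config/settings.py`, and the priorities and connection URL are placeholders.

```python
# settings.py of a Scrapy project -- illustrative sketch only.

ITEM_PIPELINES = {
    "scrapy_item_ingest.pipelines.main.DbInsertPipeline": 300,  # priority is a placeholder
}

EXTENSIONS = {
    "scrapy_item_ingest.extensions.logging.LoggingExtension": 500,
}

# PostgreSQL URL parsed by scrapy_item_ingest/database/connection.py (placeholder credentials)
DB_URL = "postgresql://user:password@localhost:5432/scraping"

# True: auto-create the job_items / job_requests / job_logs tables.
# False: write into existing tables and tag rows with JOB_ID (falls back to the spider name).
CREATE_TABLES = True
# JOB_ID = "job-123"  # optional, mainly relevant when CREATE_TABLES is False
```
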
37
+
38
+ ## Development
39
+
40
+ ### Setting up for Development
41
+
42
+ ```bash
43
+ git clone https://github.com/fawadss1/scrapy_item_ingest.git
44
+ cd scrapy_item_ingest
45
+ pip install -e ".[dev]"
46
+ ```
47
+
48
+ ## License
49
+
50
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
51
+
52
+ ## Support
53
+
54
+ For support and questions:
55
+
56
+ - **Email**: fawadstar6@gmail.com
57
+ - **Documentation**: [https://scrapy-item-ingest.readthedocs.io/](https://scrapy-item-ingest.readthedocs.io/)
58
+ - **Issues**: Please report bugs and feature requests at [GitHub Issues](https://github.com/fawadss1/scrapy_item_ingest/issues)
59
+
60
+ ## Changelog
61
+
62
+ ### v0.1.0 (Current)
63
+
64
+ - Initial release
65
+ - Core pipeline functionality for items, requests, and logs
66
+ - PostgreSQL database integration with JSONB storage
67
+ - Comprehensive documentation and examples
68
+ - Production deployment guides
69
+ - Docker and Kubernetes support
@@ -0,0 +1,48 @@
1
+ """
2
+ scrapy_item_ingest - A Scrapy extension for ingesting items and requests into databases.
3
+
4
+ This package provides pipelines and extensions for storing scraped data, tracking requests,
5
+ and logging spider events to PostgreSQL databases with support for both spider-based and
6
+ job-based identification.
7
+
8
+ Main Components:
9
+ - DbInsertPipeline: Combined pipeline for items and requests
10
+ - LoggingExtension: Extension for logging spider events
11
+ - ItemsPipeline: Standalone items processing pipeline
12
+ - RequestsPipeline: Standalone requests tracking pipeline
13
+ """
14
+
15
+ __version__ = "0.1.0"
16
+ __author__ = "Fawad Ali"
17
+ __description__ = "Scrapy extension for database ingestion with job/spider tracking"
18
+
19
+ # Import main classes directly from organized modules
20
+ from .pipelines.main import DbInsertPipeline
21
+ from .extensions.logging import LoggingExtension
22
+
23
+ # Import individual components for advanced users
24
+ from .pipelines.items import ItemsPipeline
25
+ from .pipelines.requests import RequestsPipeline
26
+
27
+ # Import configuration utilities
28
+ from .config.settings import Settings, validate_settings
29
+
30
+ # Define what gets imported with "from scrapy_item_ingest import *"
31
+ __all__ = [
32
+ # Main classes (most commonly used)
33
+ 'DbInsertPipeline',
34
+ 'LoggingExtension',
35
+
36
+ # Individual components
37
+ 'ItemsPipeline',
38
+ 'RequestsPipeline',
39
+
40
+ # Configuration
41
+ 'Settings',
42
+ 'validate_settings',
43
+
44
+ # Package metadata
45
+ '__version__',
46
+ '__author__',
47
+ '__description__',
48
+ ]
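
For reference, a quick sketch of what these re-exports give an installed user:

```python
# Illustration only: the names exposed at package level by this __init__.py.
from scrapy_item_ingest import (
    DbInsertPipeline,
    LoggingExtension,
    ItemsPipeline,
    RequestsPipeline,
    Settings,
    validate_settings,
    __version__,
)

print(__version__)  # "0.1.0"
```
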
@@ -0,0 +1,2 @@
1
+ """Configuration modules for scrapy_item_ingest."""
2
+
@@ -0,0 +1,82 @@
1
+ """
2
+ Configuration settings and utilities for scrapy_item_ingest.
3
+ """
4
+
5
+ # Static table names - no longer configurable
6
+ DEFAULT_ITEMS_TABLE = 'job_items'
7
+ DEFAULT_REQUESTS_TABLE = 'job_requests'
8
+ DEFAULT_LOGS_TABLE = 'job_logs'
9
+
10
+
11
+ class Settings:
12
+ """Settings class to handle configuration options"""
13
+
14
+ def __init__(self, crawler_settings):
15
+ self.crawler_settings = crawler_settings
16
+
17
+ @property
18
+ def db_url(self):
19
+ return self.crawler_settings.get('DB_URL')
20
+
21
+ @property
22
+ def db_type(self):
23
+ return self.crawler_settings.get('DB_TYPE', 'postgres')
24
+
25
+ @property
26
+ def db_items_table(self):
27
+ """Return static table name for items"""
28
+ return DEFAULT_ITEMS_TABLE
29
+
30
+ @property
31
+ def db_requests_table(self):
32
+ """Return static table name for requests"""
33
+ return DEFAULT_REQUESTS_TABLE
34
+
35
+ @property
36
+ def db_logs_table(self):
37
+ """Return static table name for logs"""
38
+ return DEFAULT_LOGS_TABLE
39
+
40
+ @property
41
+ def create_tables(self):
42
+ return self.crawler_settings.getbool('CREATE_TABLES', True)
43
+
44
+ @property
45
+ def use_job_id(self):
46
+ # JOB_ID only works when CREATE_TABLES = False
47
+ if self.create_tables:
48
+ return False # Don't use JOB_ID when creating tables
49
+ else:
50
+ return True # Use JOB_ID when using existing tables
51
+
52
+ @property
53
+ def job_id(self):
54
+ # Always return JOB_ID or fallback to None (spider name will be used)
55
+ return self.crawler_settings.get('JOB_ID', None)
56
+
57
+ def get_identifier_column(self):
58
+ """Get the identifier column name based on mode"""
59
+ if self.create_tables:
60
+ return "spider" # Use spider column when creating tables
61
+ else:
62
+ return "job_id" # Use job_id column when using existing tables
63
+
64
+ def get_identifier_value(self, spider):
65
+ """Get the identifier value with smart fallback"""
66
+ job_id = self.crawler_settings.get('JOB_ID', None)
67
+
68
+ if self.create_tables:
69
+ # When creating tables, use JOB_ID if provided, else spider name
70
+ return job_id if job_id else spider.name
71
+ else:
72
+ # When using existing tables, use JOB_ID if provided, else spider name
73
+ return job_id if job_id else spider.name
74
+
75
+
76
+ def validate_settings(settings):
77
+ """Validate configuration settings"""
78
+ if not settings.db_url:
79
+ raise ValueError("DB_URL must be set in settings")
80
+
81
+ # Job ID is now optional - will use spider name as fallback
82
+ return True
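
To make the two modes concrete, a small behavior sketch; the `FakeCrawlerSettings` and `FakeSpider` stand-ins are invented here purely for illustration, since in a real run the crawler supplies both.

```python
# Illustration only: how CREATE_TABLES and JOB_ID interact in Settings.
from scrapy_item_ingest.config.settings import Settings

class FakeCrawlerSettings(dict):
    """Minimal stand-in exposing the get()/getbool() calls Settings makes."""
    def getbool(self, key, default=False):
        return bool(self.get(key, default))

class FakeSpider:
    name = "books"

s = Settings(FakeCrawlerSettings({
    "DB_URL": "postgresql://user:password@localhost:5432/scraping",
    "CREATE_TABLES": False,
    "JOB_ID": "job-42",
}))
print(s.get_identifier_column())             # "job_id"  -> existing-tables mode
print(s.get_identifier_value(FakeSpider()))  # "job-42"; would fall back to "books" if JOB_ID were unset
```
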
@@ -0,0 +1,2 @@
1
+ """Database modules for scrapy_item_ingest."""
2
+
@@ -0,0 +1,67 @@
1
+ """
2
+ Database connection utilities for scrapy_item_ingest.
3
+ """
4
+ import psycopg2
5
+ import logging
6
+ from urllib.parse import urlparse, unquote
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ class DatabaseConnection:
12
+ """Database connection manager"""
13
+
14
+ def __init__(self, db_url):
15
+ self.db_url = db_url
16
+ self.conn = None
17
+ self.cur = None
18
+
19
+ def connect(self):
20
+ """Establish database connection"""
21
+ try:
22
+ result = urlparse(self.db_url)
23
+ user = result.username
24
+ password = unquote(result.password) if result.password else None
25
+ host = result.hostname
26
+ port = result.port
27
+ dbname = result.path.lstrip('/')
28
+
29
+ self.conn = psycopg2.connect(
30
+ host=host, port=port, dbname=dbname,
31
+ user=user, password=password
32
+ )
33
+ self.cur = self.conn.cursor()
34
+ logger.info("Database connection established")
35
+ return True
36
+ except Exception as e:
37
+ logger.error(f"Failed to connect to database: {e}")
38
+ return False
39
+
40
+ def close(self):
41
+ """Close database connection"""
42
+ if hasattr(self, 'cur') and self.cur:
43
+ self.cur.close()
44
+ if hasattr(self, 'conn') and self.conn:
45
+ self.conn.close()
46
+ logger.info("Database connection closed")
47
+
48
+ def execute(self, sql, params=None):
49
+ """Execute SQL query"""
50
+ try:
51
+ if params:
52
+ self.cur.execute(sql, params)
53
+ else:
54
+ self.cur.execute(sql)
55
+ return self.cur.fetchone() if self.cur.description else None
56
+ except Exception as e:
57
+ logger.error(f"Failed to execute query: {e}")
58
+ self.conn.rollback()
59
+ raise
60
+
61
+ def commit(self):
62
+ """Commit transaction"""
63
+ self.conn.commit()
64
+
65
+ def rollback(self):
66
+ """Rollback transaction"""
67
+ self.conn.rollback()
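
A minimal usage sketch of `DatabaseConnection` outside a pipeline; the URL is a placeholder and a reachable PostgreSQL instance is assumed.

```python
# Illustration only: direct use of the connection manager.
from scrapy_item_ingest.database.connection import DatabaseConnection

db = DatabaseConnection("postgresql://user:password@localhost:5432/scraping")
if db.connect():
    # execute() returns the first row when the statement produces a result set, else None
    print(db.execute("SELECT version()"))
    db.commit()
    db.close()
```
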
@@ -0,0 +1,79 @@
1
+ """
2
+ Database schema management utilities for scrapy_item_ingest.
3
+ """
4
+ import logging
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
+ class SchemaManager:
10
+ """Database schema management"""
11
+
12
+ def __init__(self, db_connection, settings):
13
+ self.db = db_connection
14
+ self.settings = settings
15
+
16
+ def create_items_table(self):
17
+ """Create items table if it doesn't exist"""
18
+ items_table_sql = f"""
19
+ CREATE TABLE IF NOT EXISTS {self.settings.db_items_table} (
20
+ id SERIAL PRIMARY KEY,
21
+ job_id VARCHAR(255),
22
+ item JSONB,
23
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
24
+ )
25
+ """
26
+ self.db.execute(items_table_sql)
27
+ logger.info(f"Items table {self.settings.db_items_table} created/verified with job_id column")
28
+
29
+ def create_requests_table(self):
30
+ """Create requests table if it doesn't exist"""
31
+ requests_table_sql = f"""
32
+ CREATE TABLE IF NOT EXISTS {self.settings.db_requests_table} (
33
+ id SERIAL PRIMARY KEY,
34
+ job_id VARCHAR(255),
35
+ url TEXT,
36
+ method VARCHAR(10),
37
+ status_code INTEGER,
38
+ duration FLOAT,
39
+ response_time FLOAT,
40
+ fingerprint VARCHAR(64),
41
+ parent_id INTEGER,
42
+ parent_url TEXT,
43
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
44
+ FOREIGN KEY (parent_id) REFERENCES {self.settings.db_requests_table}(id)
45
+ )
46
+ """
47
+ self.db.execute(requests_table_sql)
48
+ logger.info(f"Requests table {self.settings.db_requests_table} created/verified with job_id column")
49
+
50
+ def create_logs_table(self):
51
+ """Create logs table if it doesn't exist"""
52
+ logs_table_sql = f"""
53
+ CREATE TABLE IF NOT EXISTS {self.settings.db_logs_table} (
54
+ id SERIAL PRIMARY KEY,
55
+ job_id VARCHAR(255),
56
+ level VARCHAR(50),
57
+ message TEXT,
58
+ timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
59
+ )
60
+ """
61
+ self.db.execute(logs_table_sql)
62
+ logger.info(f"Logs table {self.settings.db_logs_table} created/verified with job_id column")
63
+
64
+ def ensure_tables_exist(self):
65
+ """Create all tables if they don't exist (only if create_tables is True)"""
66
+ if not self.settings.create_tables:
67
+ logger.info("Table creation disabled. Skipping table creation.")
68
+ return
69
+
70
+ try:
71
+ self.create_items_table()
72
+ self.create_requests_table()
73
+ self.create_logs_table()
74
+ self.db.commit()
75
+ logger.info("All tables created/verified successfully")
76
+ except Exception as e:
77
+ logger.error(f"Failed to create tables: {e}")
78
+ self.db.rollback()
79
+ raise
@@ -0,0 +1,2 @@
1
+ """Extension modules for scrapy_item_ingest."""
2
+
@@ -0,0 +1,79 @@
1
+ """
2
+ Base extension functionality for scrapy_item_ingest.
3
+ """
4
+ import logging
5
+ from datetime import datetime
6
+ from sqlalchemy import create_engine, text
7
+ from ..config.settings import Settings, validate_settings
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ class BaseExtension:
13
+ """Base extension with common functionality"""
14
+
15
+ def __init__(self, settings):
16
+ self.settings = settings
17
+ validate_settings(settings)
18
+
19
+ @classmethod
20
+ def from_crawler(cls, crawler):
21
+ """Create extension instance from crawler"""
22
+ settings = Settings(crawler.settings)
23
+ return cls(settings)
24
+
25
+ def get_identifier_info(self, spider):
26
+ """Get identifier column and value for the spider"""
27
+ return self.settings.get_identifier_column(), self.settings.get_identifier_value(spider)
28
+
29
+ def _ensure_logs_table_exists(self, engine):
30
+ """Create logs table if it doesn't exist (only if create_tables is True)"""
31
+ if not self.settings.create_tables:
32
+ logger.info("Table creation disabled. Skipping logs table creation.")
33
+ return
34
+
35
+ try:
36
+ with engine.connect() as connection:
37
+ # Determine the identifier column name
38
+ identifier_column = self.settings.get_identifier_column()
39
+
40
+ # Create logs table with type, message, and timestamp
41
+ logs_table_sql = f"""
42
+ CREATE TABLE IF NOT EXISTS {self.settings.db_logs_table} (
43
+ id SERIAL PRIMARY KEY,
44
+ {identifier_column} VARCHAR(255),
45
+ type VARCHAR(50),
46
+ message TEXT,
47
+ timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
48
+ )
49
+ """
50
+ connection.execute(text(logs_table_sql))
51
+ connection.commit()
52
+ logger.info(f"Logs table {self.settings.db_logs_table} created/verified with {identifier_column} column")
53
+ except Exception as e:
54
+ logger.error(f"Failed to create logs table: {e}")
55
+
56
+ def _log_to_database(self, spider, log_level, message):
57
+ """Helper method to log messages to database"""
58
+ try:
59
+ identifier_column, identifier_value = self.get_identifier_info(spider)
60
+
61
+ engine = create_engine(self.settings.db_url)
62
+ self._ensure_logs_table_exists(engine)
63
+
64
+ stmt = text(f"""
65
+ INSERT INTO {self.settings.db_logs_table}
66
+ ({identifier_column}, level, message, timestamp)
67
+ VALUES (:identifier, :type, :message, :timestamp)
68
+ """)
69
+ with engine.connect() as connection:
70
+ connection.execute(stmt, {
71
+ "identifier": identifier_value,
72
+ "type": log_level,
73
+ "message": message,
74
+ "timestamp": datetime.now()
75
+ })
76
+ connection.commit()
77
+ logger.info(f"Logged {log_level} for {identifier_column} {identifier_value}")
78
+ except Exception as e:
79
+ logger.error(f"Failed to log {log_level}: {e}")
@@ -0,0 +1,45 @@
1
+ """
2
+ Logging extension for tracking spider events.
3
+ """
4
+ import logging
5
+ from scrapy import signals
6
+ from .base import BaseExtension
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ class LoggingExtension(BaseExtension):
12
+ """Extension for logging spider events to database"""
13
+
14
+ @classmethod
15
+ def from_crawler(cls, crawler):
16
+ """Create extension instance from crawler"""
17
+ ext = super().from_crawler(crawler)
18
+ # Connect to spider signals
19
+ crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
20
+ crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
21
+ crawler.signals.connect(ext.spider_error, signal=signals.spider_error)
22
+ crawler.signals.connect(ext.item_dropped, signal=signals.item_dropped)
23
+ return ext
24
+
25
+ def spider_opened(self, spider):
26
+ """Called when spider is opened"""
27
+ identifier_column, identifier_value = self.get_identifier_info(spider)
28
+ message = f"{identifier_column.title()} {identifier_value} started"
29
+ self._log_to_database(spider, "SPIDER_OPENED", message)
30
+
31
+ def spider_closed(self, spider, reason):
32
+ """Called when spider is closed"""
33
+ identifier_column, identifier_value = self.get_identifier_info(spider)
34
+ message = f"{identifier_column.title()} {identifier_value} closed with reason: {reason}"
35
+ self._log_to_database(spider, "SPIDER_CLOSED", message)
36
+
37
+ def spider_error(self, failure, response, spider):
38
+ """Called when spider encounters an error"""
39
+ message = f"Spider error: {str(failure.value)} on {response.url if response else 'unknown URL'}"
40
+ self._log_to_database(spider, "SPIDER_ERROR", message)
41
+
42
+ def item_dropped(self, item, response, spider, exception):
43
+ """Called when an item is dropped"""
44
+ message = f"Item dropped: {str(exception)} from {response.url if response else 'unknown URL'}"
45
+ self._log_to_database(spider, "ITEM_DROPPED", message)
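
The same signal wiring can be extended in user code; a sketch follows, where the `VerboseLoggingExtension` class and its `item_scraped` handler are our additions, not part of the package.

```python
# Illustration only: adding one more Scrapy signal on top of LoggingExtension.
from scrapy import signals
from scrapy_item_ingest.extensions.logging import LoggingExtension

class VerboseLoggingExtension(LoggingExtension):
    @classmethod
    def from_crawler(cls, crawler):
        ext = super().from_crawler(crawler)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        return ext

    def item_scraped(self, item, response, spider):
        """Log every scraped item through the same _log_to_database helper."""
        self._log_to_database(spider, "ITEM_SCRAPED", f"Item scraped from {response.url}")
```
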
@@ -0,0 +1,2 @@
1
+ """Pipeline modules for scrapy_item_ingest."""
2
+
@@ -0,0 +1,50 @@
1
+ """
2
+ Base pipeline functionality for scrapy_item_ingest.
3
+ """
4
+ import logging
5
+ from ..config.settings import Settings, validate_settings
6
+ from ..database.connection import DatabaseConnection
7
+ from ..database.schema import SchemaManager
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ class BasePipeline:
13
+ """Base pipeline with common functionality"""
14
+
15
+ def __init__(self, settings):
16
+ self.settings = settings
17
+ self.db = None
18
+ self.schema_manager = None
19
+ validate_settings(settings)
20
+
21
+ @classmethod
22
+ def from_crawler(cls, crawler):
23
+ """Create pipeline instance from crawler"""
24
+ settings = Settings(crawler.settings)
25
+ return cls(settings)
26
+
27
+ def open_spider(self, spider):
28
+ """Called when spider is opened"""
29
+ # Establish database connection
30
+ self.db = DatabaseConnection(self.settings.db_url)
31
+ if not self.db.connect():
32
+ raise Exception("Failed to connect to database")
33
+
34
+ # Initialize schema manager
35
+ self.schema_manager = SchemaManager(self.db, self.settings)
36
+
37
+ # Ensure tables exist
38
+ self.schema_manager.ensure_tables_exist()
39
+
40
+ logger.info(f"Pipeline opened for {self.settings.get_identifier_column()}: {self.settings.get_identifier_value(spider)}")
41
+
42
+ def close_spider(self, spider):
43
+ """Called when spider is closed"""
44
+ if self.db:
45
+ self.db.close()
46
+ logger.info(f"Pipeline closed for {self.settings.get_identifier_column()}: {self.settings.get_identifier_value(spider)}")
47
+
48
+ def get_identifier_info(self, spider):
49
+ """Get identifier column and value for the spider"""
50
+ return self.settings.get_identifier_column(), self.settings.get_identifier_value(spider)
@@ -0,0 +1,42 @@
1
+ """
2
+ Items pipeline for storing scraped items.
3
+ """
4
+ import logging
5
+ from datetime import datetime, timezone
6
+ from itemadapter import ItemAdapter
7
+ from scrapy.exceptions import DropItem
8
+ from .base import BasePipeline
9
+ from ..utils.serialization import serialize_item_data
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ class ItemsPipeline(BasePipeline):
15
+ """Pipeline for handling scraped items"""
16
+
17
+ def process_item(self, item, spider):
18
+ """Process and store item in database"""
19
+ job_id = self.settings.get_identifier_value(spider)
20
+
21
+ logger.info(f"Processing item for job_id {job_id}: {item}")
22
+ adapter = ItemAdapter(item)
23
+ item_dict = adapter.asdict()
24
+ created_at = datetime.now(timezone.utc)
25
+
26
+ logger.info(f"Item dict prepared: {item_dict}")
27
+
28
+ # Store everything as JSON in the item column
29
+ try:
30
+ sql = f"INSERT INTO {self.settings.db_items_table} (job_id, item, created_at) VALUES (%s, %s, %s)"
31
+ json_data = serialize_item_data(item_dict)
32
+ logger.info(f"Executing SQL: {sql} with JSON data")
33
+
34
+ self.db.execute(sql, (job_id, json_data, created_at))
35
+ self.db.commit()
36
+ logger.info(f"Successfully inserted item for job_id {job_id}")
37
+ except Exception as e:
38
+ logger.error(f"Failed to insert item: {e}")
39
+ self.db.rollback()
40
+ raise DropItem(f"DB insert error: {e}")
41
+
42
+ return item
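
Any item type ItemAdapter understands (dict, scrapy.Item, dataclass, attrs) is flattened to a dict and stored as a single JSONB value in `job_items.item`; a sketch of a spider feeding this pipeline, with an illustrative name and URL.

```python
# Illustration only: items yielded like this land as one JSONB document per row.
import scrapy

class BooksSpider(scrapy.Spider):
    name = "books"
    start_urls = ["https://example.com/books"]

    def parse(self, response):
        yield {
            "title": response.css("h1::text").get(),
            "url": response.url,
        }
```
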
@@ -0,0 +1,41 @@
1
+ """
2
+ Main pipeline that combines items and requests functionality.
3
+ """
4
+ import logging
5
+ from .items import ItemsPipeline
6
+ from .requests import RequestsPipeline
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ class DbInsertPipeline(ItemsPipeline, RequestsPipeline):
12
+ """
13
+ Main pipeline that combines item processing and request tracking.
14
+ Inherits from both ItemsPipeline and RequestsPipeline.
15
+ """
16
+
17
+ def __init__(self, settings):
18
+ # Initialize both parent classes
19
+ ItemsPipeline.__init__(self, settings)
20
+ RequestsPipeline.__init__(self, settings)
21
+
22
+ @classmethod
23
+ def from_crawler(cls, crawler):
24
+ """Create pipeline instance from crawler"""
25
+ # Use RequestsPipeline's from_crawler to get signal connections
26
+ return RequestsPipeline.from_crawler.__func__(cls, crawler)
27
+
28
+ def open_spider(self, spider):
29
+ """Called when spider is opened"""
30
+ # Use the base class implementation
31
+ super().open_spider(spider)
32
+
33
+ def close_spider(self, spider):
34
+ """Called when spider is closed"""
35
+ # Use the base class implementation
36
+ super().close_spider(spider)
37
+
38
+ def process_item(self, item, spider):
39
+ """Process and store item in database"""
40
+ # Use ItemsPipeline's process_item method
41
+ return ItemsPipeline.process_item(self, item, spider)
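
If only part of the functionality is needed, the standalone pipelines can be wired instead of the combined class; a sketch for a project's settings.py, with placeholder priorities.

```python
# Illustration only: alternative ITEM_PIPELINES wiring.
ITEM_PIPELINES = {
    "scrapy_item_ingest.pipelines.items.ItemsPipeline": 300,
    "scrapy_item_ingest.pipelines.requests.RequestsPipeline": 310,
}
```
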
@@ -0,0 +1,169 @@
1
+ """
2
+ Requests pipeline for tracking request information.
3
+ """
4
+ import logging
5
+ import time
6
+ from datetime import datetime, timezone
7
+ from scrapy import signals
8
+ from .base import BasePipeline
9
+ from ..utils.fingerprint import get_request_fingerprint
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ class RequestsPipeline(BasePipeline):
15
+ """Pipeline for handling request tracking"""
16
+
17
+ def __init__(self, settings):
18
+ super().__init__(settings)
19
+ self.request_start_times = {} # Track request start times
20
+ self.request_id_map = {} # Track fingerprint to database ID mapping
21
+ self.url_to_id_map = {} # Track URL to database ID mapping
22
+ self.current_response_url = None # Track current response being processed
23
+
24
+ @classmethod
25
+ def from_crawler(cls, crawler):
26
+ """Create pipeline instance from crawler"""
27
+ pipeline = super().from_crawler(crawler)
28
+ # Connect to request signals to automatically log requests
29
+ crawler.signals.connect(pipeline.request_scheduled, signal=signals.request_scheduled)
30
+ crawler.signals.connect(pipeline.response_received, signal=signals.response_received)
31
+ return pipeline
32
+
33
+ def _get_parent_request_info(self, request, spider):
34
+ """Extract parent request information if available"""
35
+ parent_id = None
36
+ parent_url = None
37
+
38
+ # Get job_id for the current spider
39
+ job_id = self.settings.get_identifier_value(spider)
40
+
41
+ try:
42
+ # Method 1: Use current response URL as parent (most reliable)
43
+ if self.current_response_url and self.current_response_url != request.url:
44
+ parent_url = self.current_response_url
45
+ if parent_url in self.url_to_id_map:
46
+ parent_id = self.url_to_id_map[parent_url]
47
+ logger.info(f"Found parent ID {parent_id} from current response URL: {parent_url}")
48
+
49
+ # Method 2: Check request meta for referer
50
+ if not parent_id and hasattr(request, 'meta') and request.meta:
51
+ if 'referer' in request.meta:
52
+ parent_url = request.meta['referer']
53
+ logger.info(f"Found referer in meta: {parent_url}")
54
+
55
+ # Look up in our URL mapping first (faster)
56
+ if parent_url in self.url_to_id_map:
57
+ parent_id = self.url_to_id_map[parent_url]
58
+ logger.info(f"Found parent ID {parent_id} from URL mapping")
59
+ else:
60
+ # Look up in database
61
+ try:
62
+ sql = f"SELECT id FROM {self.settings.db_requests_table} WHERE url = %s AND job_id = %s ORDER BY created_at DESC LIMIT 1"
63
+ result = self.db.execute(sql, (parent_url, job_id))
64
+ if result:
65
+ parent_id = result[0]
66
+ # Cache the result
67
+ self.url_to_id_map[parent_url] = parent_id
68
+ logger.info(f"Found parent ID {parent_id} from database lookup")
69
+ except Exception as e:
70
+ logger.warning(f"Could not look up parent ID by referer URL: {e}")
71
+
72
+ # Debug: Log request meta information
73
+ logger.debug(f"Request URL: {request.url}")
74
+ logger.debug(f"Request meta keys: {list(request.meta.keys()) if request.meta else 'None'}")
75
+ if 'depth' in request.meta:
76
+ logger.debug(f"Request depth: {request.meta['depth']}")
77
+
78
+ except Exception as e:
79
+ logger.warning(f"Could not extract parent request info: {e}")
80
+
81
+ # If we still don't have parent info, log for debugging
82
+ if not parent_id and not parent_url:
83
+ logger.debug(f"No parent found for request: {request.url}")
84
+
85
+ return parent_id, parent_url
86
+
87
+ def log_request(self, request, spider):
88
+ """Log request to database"""
89
+ job_id = self.settings.get_identifier_value(spider)
90
+
91
+ logger.info(f"Logging request for job_id {job_id}: {request.url}")
92
+ fingerprint = get_request_fingerprint(request)
93
+ parent_id, parent_url = self._get_parent_request_info(request, spider)
94
+ request_time = time.time()
95
+ created_at = datetime.now(timezone.utc)
96
+
97
+ # Store request start time for duration calculation
98
+ self.request_start_times[fingerprint] = request_time
99
+
100
+ sql = f"""
101
+ INSERT INTO {self.settings.db_requests_table}
102
+ (job_id, url, method, fingerprint, parent_id, parent_url, created_at)
103
+ VALUES (%s, %s, %s, %s, %s, %s, %s) RETURNING id
104
+ """
105
+ try:
106
+ result = self.db.execute(sql, (
107
+ job_id,
108
+ request.url,
109
+ request.method,
110
+ fingerprint,
111
+ parent_id,
112
+ parent_url,
113
+ created_at
114
+ ))
115
+
116
+ # Get the inserted record ID and store it for future parent lookups
117
+ if result:
118
+ record_id = result[0]
119
+ self.request_id_map[fingerprint] = record_id
120
+ self.url_to_id_map[request.url] = record_id # Store URL to ID mapping
121
+
122
+ self.db.commit()
123
+
124
+ log_msg = f"Successfully logged request for job_id {job_id} with fingerprint {fingerprint} (ID: {record_id})"
125
+ if parent_id:
126
+ log_msg += f" (parent ID: {parent_id}, parent URL: {parent_url})"
127
+ else:
128
+ log_msg += " (no parent found)"
129
+ logger.info(log_msg)
130
+ except Exception as e:
131
+ logger.error(f"Failed to log request: {e}")
132
+ self.db.rollback()
133
+
134
+ def request_scheduled(self, request, spider):
135
+ """Called when a request is scheduled"""
136
+ job_id = self.settings.get_identifier_value(spider)
137
+ logger.info(f"Request scheduled for job_id {job_id}: {request.url}")
138
+ self.log_request(request, spider)
139
+
140
+ def response_received(self, response, request, spider):
141
+ """Called when a response is received"""
142
+ job_id = self.settings.get_identifier_value(spider)
143
+
144
+ logger.info(f"Response received for job_id {job_id}: {response.url} (status: {response.status})")
145
+
146
+ # Set current response URL for parent tracking
147
+ self.current_response_url = response.url
148
+
149
+ fingerprint = get_request_fingerprint(request)
150
+ response_time = time.time()
151
+
152
+ # Update the request log with response info
153
+ try:
154
+ sql = f"""
155
+ UPDATE {self.settings.db_requests_table}
156
+ SET status_code = %s, response_time = %s
157
+ WHERE job_id = %s AND fingerprint = %s AND status_code IS NULL
158
+ """
159
+ self.db.execute(sql, (
160
+ response.status,
161
+ response_time,
162
+ job_id,
163
+ fingerprint
164
+ ))
165
+ self.db.commit()
166
+ logger.info(f"Updated request status {response.status} and response_time for fingerprint {fingerprint}")
167
+ except Exception as e:
168
+ logger.error(f"Failed to update request status: {e}")
169
+ self.db.rollback()
@@ -0,0 +1,2 @@
1
+ """Utility modules for scrapy_item_ingest."""
2
+
@@ -0,0 +1,25 @@
1
+ """
2
+ Request fingerprint utilities for generating unique request identifiers.
3
+ """
4
+ import hashlib
5
+ import logging
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ def get_request_fingerprint(request):
11
+ """Generate fingerprint for the request"""
12
+ try:
13
+ from scrapy.utils.request import request_fingerprint
14
+ return request_fingerprint(request)
15
+ except Exception as e:
16
+ logger.warning(f"Could not generate fingerprint: {e}")
17
+ # Fallback fingerprint generation
18
+ fingerprint_data = f"{request.method}:{request.url}"
19
+ return hashlib.sha1(fingerprint_data.encode()).hexdigest()
20
+
21
+
22
+ def generate_url_fingerprint(method, url):
23
+ """Generate a simple fingerprint for URL and method combination"""
24
+ fingerprint_data = f"{method}:{url}"
25
+ return hashlib.sha1(fingerprint_data.encode()).hexdigest()
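
A quick sketch of the two helpers; note the primary path relies on `scrapy.utils.request.request_fingerprint`, which recent Scrapy releases deprecate, and if that call fails the SHA-1 fallback is used.

```python
# Illustration only: fingerprints as used by RequestsPipeline.
from scrapy import Request
from scrapy_item_ingest.utils.fingerprint import get_request_fingerprint, generate_url_fingerprint

req = Request("https://example.com/page")
print(get_request_fingerprint(req))               # Scrapy's fingerprint, or the method:url SHA-1 fallback
print(generate_url_fingerprint("GET", req.url))   # always the simple SHA-1 of "GET:https://..."
```
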
@@ -0,0 +1,28 @@
1
+ """
2
+ Serialization utilities for converting data to JSON-serializable format.
3
+ """
4
+ import json
5
+ from datetime import datetime, date, time
6
+ from decimal import Decimal
7
+
8
+
9
+ def serialize_stats(obj):
10
+ """Recursively convert stats to JSON-serializable format"""
11
+ if isinstance(obj, dict):
12
+ return {key: serialize_stats(value) for key, value in obj.items()}
13
+ elif isinstance(obj, (list, tuple)):
14
+ return [serialize_stats(item) for item in obj]
15
+ elif isinstance(obj, (datetime, date, time)):
16
+ return obj.isoformat()
17
+ elif isinstance(obj, Decimal):
18
+ return float(obj)
19
+ elif isinstance(obj, (int, float, str, bool)) or obj is None:
20
+ return obj
21
+ else:
22
+ # For any other type, convert to string
23
+ return str(obj)
24
+
25
+
26
+ def serialize_item_data(item_dict):
27
+ """Serialize item data to JSON string"""
28
+ return json.dumps(item_dict, ensure_ascii=False, default=str)
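
A sketch of what the two helpers do with the non-JSON types that commonly appear in items and stats:

```python
# Illustration only: Decimal and datetime values become JSON-safe.
from datetime import datetime, timezone
from decimal import Decimal
from scrapy_item_ingest.utils.serialization import serialize_item_data, serialize_stats

raw = {"price": Decimal("9.99"), "scraped_at": datetime(2025, 1, 1, tzinfo=timezone.utc)}
print(serialize_stats(raw))      # {'price': 9.99, 'scraped_at': '2025-01-01T00:00:00+00:00'}
print(serialize_item_data(raw))  # JSON string; unsupported types fall back to str() via default=str
```
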
@@ -0,0 +1,132 @@
1
+ Metadata-Version: 2.4
2
+ Name: scrapy_item_ingest
3
+ Version: 0.1.0
4
+ Summary: Scrapy extension for database ingestion with job/spider tracking
5
+ Home-page: https://github.com/fawadss1/scrapy_item_ingest
6
+ Author: Fawad Ali
7
+ Author-email: fawadstar6@gmail.com
8
+ Project-URL: Documentation, https://scrapy-item-ingest.readthedocs.io/
9
+ Project-URL: Source, https://github.com/fawadss1/scrapy_item_ingest
10
+ Project-URL: Tracker, https://github.com/fawadss1/scrapy_item_ingest/issues
11
+ Keywords: scrapy,database,postgresql,web-scraping,data-pipeline
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.7
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Framework :: Scrapy
23
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
+ Classifier: Topic :: Internet :: WWW/HTTP
25
+ Classifier: Topic :: Database
26
+ Requires-Python: >=3.7
27
+ Description-Content-Type: text/markdown
28
+ License-File: LICENSE
29
+ Requires-Dist: scrapy>=2.5.0
30
+ Requires-Dist: psycopg2-binary>=2.8.0
31
+ Requires-Dist: itemadapter>=0.6.0
32
+ Requires-Dist: SQLAlchemy>=1.4.0
33
+ Provides-Extra: docs
34
+ Requires-Dist: sphinx>=5.0.0; extra == "docs"
35
+ Requires-Dist: sphinx_rtd_theme>=1.2.0; extra == "docs"
36
+ Requires-Dist: myst-parser>=0.18.0; extra == "docs"
37
+ Requires-Dist: sphinx-autodoc-typehints>=1.19.0; extra == "docs"
38
+ Requires-Dist: sphinx-copybutton>=0.5.0; extra == "docs"
39
+ Provides-Extra: dev
40
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
41
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
42
+ Requires-Dist: black>=22.0.0; extra == "dev"
43
+ Requires-Dist: flake8>=5.0.0; extra == "dev"
44
+ Requires-Dist: mypy>=0.991; extra == "dev"
45
+ Requires-Dist: pre-commit>=2.20.0; extra == "dev"
46
+ Provides-Extra: test
47
+ Requires-Dist: pytest>=7.0.0; extra == "test"
48
+ Requires-Dist: pytest-cov>=4.0.0; extra == "test"
49
+ Requires-Dist: pytest-mock>=3.8.0; extra == "test"
50
+ Dynamic: author
51
+ Dynamic: author-email
52
+ Dynamic: classifier
53
+ Dynamic: description
54
+ Dynamic: description-content-type
55
+ Dynamic: home-page
56
+ Dynamic: keywords
57
+ Dynamic: license-file
58
+ Dynamic: project-url
59
+ Dynamic: provides-extra
60
+ Dynamic: requires-dist
61
+ Dynamic: requires-python
62
+ Dynamic: summary
63
+
64
+ # Scrapy Item Ingest
65
+
66
+ [![PyPI Version](https://img.shields.io/pypi/v/scrapy-item-ingest.svg)](https://pypi.org/project/scrapy-item-ingest/)
67
+ [![PyPI Downloads](https://img.shields.io/pypi/dm/scrapy-item-ingest.svg)](https://pypi.org/project/scrapy-item-ingest/)
68
+ [![Supported Python Versions](https://img.shields.io/pypi/pyversions/scrapy-item-ingest.svg)](https://pypi.org/project/scrapy-item-ingest/)
69
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
70
+
71
+ [![GitHub Stars](https://img.shields.io/github/stars/fawadss1/scrapy_item_ingest.svg)](https://github.com/fawadss1/scrapy_item_ingest/stargazers)
72
+ [![GitHub Issues](https://img.shields.io/github/issues/fawadss1/scrapy_item_ingest.svg)](https://github.com/fawadss1/scrapy_item_ingest/issues)
73
+ [![GitHub Last Commit](https://img.shields.io/github/last-commit/fawadss1/scrapy_item_ingest.svg)](https://github.com/fawadss1/scrapy_item_ingest/commits)
74
+
75
+ A comprehensive Scrapy extension for ingesting scraped items, requests, and logs into PostgreSQL databases with advanced tracking capabilities. This library provides a clean, production-ready solution for storing and monitoring your Scrapy crawling operations with real-time data ingestion and comprehensive logging.
76
+
77
+ ## Documentation
78
+
79
+ Full documentation is available at: [https://scrapy-item-ingest.readthedocs.io/en/latest/](https://scrapy-item-ingest.readthedocs.io/en/latest/)
80
+
81
+ ## Key Features
82
+
83
+ - 🔄 **Real-time Data Ingestion**: Store items, requests, and logs as they're processed
84
+ - 📊 **Request Tracking**: Track request response times, fingerprints, and parent-child relationships
85
+ - 🔍 **Comprehensive Logging**: Capture spider events, errors, and custom messages
86
+ - 🏗️ **Flexible Schema**: Support for both auto-creation and existing table modes
87
+ - ⚙️ **Modular Design**: Use individual components or the complete pipeline
88
+ - 🛡️ **Production Ready**: Handles both development and production scenarios
89
+ - 📝 **JSONB Storage**: Store complex item data as JSONB for flexible querying
90
+ - 🐳 **Docker Support**: Complete containerization with Docker and Kubernetes
91
+ - 📈 **Performance Optimized**: Connection pooling and batch processing
92
+ - 🔧 **Easy Configuration**: Environment-based configuration with validation
93
+ - 📊 **Monitoring Ready**: Built-in metrics and health checks
94
+
95
+ ## Installation
96
+
97
+ ```bash
98
+ pip install scrapy-item-ingest
99
+ ```
100
+
101
+ ## Development
102
+
103
+ ### Setting up for Development
104
+
105
+ ```bash
106
+ git clone https://github.com/fawadss1/scrapy_item_ingest.git
107
+ cd scrapy_item_ingest
108
+ pip install -e ".[dev]"
109
+ ```
110
+
111
+ ## License
112
+
113
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
114
+
115
+ ## Support
116
+
117
+ For support and questions:
118
+
119
+ - **Email**: fawadstar6@gmail.com
120
+ - **Documentation**: [https://scrapy-item-ingest.readthedocs.io/](https://scrapy-item-ingest.readthedocs.io/)
121
+ - **Issues**: Please report bugs and feature requests at [GitHub Issues](https://github.com/fawadss1/scrapy_item_ingest/issues)
122
+
123
+ ## Changelog
124
+
125
+ ### v0.1.0 (Current)
126
+
127
+ - Initial release
128
+ - Core pipeline functionality for items, requests, and logs
129
+ - PostgreSQL database integration with JSONB storage
130
+ - Comprehensive documentation and examples
131
+ - Production deployment guides
132
+ - Docker and Kubernetes support
@@ -0,0 +1,27 @@
1
+ LICENSE
2
+ README.md
3
+ setup.py
4
+ scrapy_item_ingest/__init__.py
5
+ scrapy_item_ingest.egg-info/PKG-INFO
6
+ scrapy_item_ingest.egg-info/SOURCES.txt
7
+ scrapy_item_ingest.egg-info/dependency_links.txt
8
+ scrapy_item_ingest.egg-info/entry_points.txt
9
+ scrapy_item_ingest.egg-info/not-zip-safe
10
+ scrapy_item_ingest.egg-info/requires.txt
11
+ scrapy_item_ingest.egg-info/top_level.txt
12
+ scrapy_item_ingest/config/__init__.py
13
+ scrapy_item_ingest/config/settings.py
14
+ scrapy_item_ingest/database/__init__.py
15
+ scrapy_item_ingest/database/connection.py
16
+ scrapy_item_ingest/database/schema.py
17
+ scrapy_item_ingest/extensions/__init__.py
18
+ scrapy_item_ingest/extensions/base.py
19
+ scrapy_item_ingest/extensions/logging.py
20
+ scrapy_item_ingest/pipelines/__init__.py
21
+ scrapy_item_ingest/pipelines/base.py
22
+ scrapy_item_ingest/pipelines/items.py
23
+ scrapy_item_ingest/pipelines/main.py
24
+ scrapy_item_ingest/pipelines/requests.py
25
+ scrapy_item_ingest/utils/__init__.py
26
+ scrapy_item_ingest/utils/fingerprint.py
27
+ scrapy_item_ingest/utils/serialization.py
@@ -0,0 +1,5 @@
1
+ [scrapy.extensions]
2
+ logging_ext = scrapy_item_ingest.extensions.logging:LoggingExtension
3
+
4
+ [scrapy.pipelines]
5
+ db_ingest = scrapy_item_ingest.pipelines.main:DbInsertPipeline
@@ -0,0 +1,24 @@
1
+ scrapy>=2.5.0
2
+ psycopg2-binary>=2.8.0
3
+ itemadapter>=0.6.0
4
+ SQLAlchemy>=1.4.0
5
+
6
+ [dev]
7
+ pytest>=7.0.0
8
+ pytest-cov>=4.0.0
9
+ black>=22.0.0
10
+ flake8>=5.0.0
11
+ mypy>=0.991
12
+ pre-commit>=2.20.0
13
+
14
+ [docs]
15
+ sphinx>=5.0.0
16
+ sphinx_rtd_theme>=1.2.0
17
+ myst-parser>=0.18.0
18
+ sphinx-autodoc-typehints>=1.19.0
19
+ sphinx-copybutton>=0.5.0
20
+
21
+ [test]
22
+ pytest>=7.0.0
23
+ pytest-cov>=4.0.0
24
+ pytest-mock>=3.8.0
@@ -0,0 +1 @@
1
+ scrapy_item_ingest
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,81 @@
1
+ from setuptools import setup, find_packages
2
+
3
+ # Read the README file for long description
4
+ try:
5
+ with open("README.md", "r", encoding="utf-8") as fh:
6
+ long_description = fh.read()
7
+ except FileNotFoundError:
8
+ long_description = "A comprehensive Scrapy extension for ingesting scraped items, requests, and logs into PostgreSQL databases."
9
+
10
+ setup(
11
+ name="scrapy_item_ingest",
12
+ version="0.1.0",
13
+ description="Scrapy extension for database ingestion with job/spider tracking",
14
+ long_description=long_description,
15
+ long_description_content_type="text/markdown",
16
+ author="Fawad Ali",
17
+ author_email="fawadstar6@gmail.com",
18
+ url="https://github.com/fawadss1/scrapy_item_ingest",
19
+ project_urls={
20
+ "Documentation": "https://scrapy-item-ingest.readthedocs.io/",
21
+ "Source": "https://github.com/fawadss1/scrapy_item_ingest",
22
+ "Tracker": "https://github.com/fawadss1/scrapy_item_ingest/issues",
23
+ },
24
+ packages=find_packages(),
25
+ classifiers=[
26
+ "Development Status :: 4 - Beta",
27
+ "Intended Audience :: Developers",
28
+ "License :: OSI Approved :: MIT License",
29
+ "Operating System :: OS Independent",
30
+ "Programming Language :: Python :: 3",
31
+ "Programming Language :: Python :: 3.7",
32
+ "Programming Language :: Python :: 3.8",
33
+ "Programming Language :: Python :: 3.9",
34
+ "Programming Language :: Python :: 3.10",
35
+ "Programming Language :: Python :: 3.11",
36
+ "Framework :: Scrapy",
37
+ "Topic :: Software Development :: Libraries :: Python Modules",
38
+ "Topic :: Internet :: WWW/HTTP",
39
+ "Topic :: Database",
40
+ ],
41
+ keywords="scrapy, database, postgresql, web-scraping, data-pipeline",
42
+ install_requires=[
43
+ "scrapy>=2.5.0",
44
+ "psycopg2-binary>=2.8.0",
45
+ "itemadapter>=0.6.0",
46
+ "SQLAlchemy>=1.4.0",
47
+ ],
48
+ extras_require={
49
+ "docs": [
50
+ "sphinx>=5.0.0",
51
+ "sphinx_rtd_theme>=1.2.0",
52
+ "myst-parser>=0.18.0",
53
+ "sphinx-autodoc-typehints>=1.19.0",
54
+ "sphinx-copybutton>=0.5.0",
55
+ ],
56
+ "dev": [
57
+ "pytest>=7.0.0",
58
+ "pytest-cov>=4.0.0",
59
+ "black>=22.0.0",
60
+ "flake8>=5.0.0",
61
+ "mypy>=0.991",
62
+ "pre-commit>=2.20.0",
63
+ ],
64
+ "test": [
65
+ "pytest>=7.0.0",
66
+ "pytest-cov>=4.0.0",
67
+ "pytest-mock>=3.8.0",
68
+ ],
69
+ },
70
+ entry_points={
71
+ "scrapy.pipelines": [
72
+ "db_ingest = scrapy_item_ingest.pipelines.main:DbInsertPipeline"
73
+ ],
74
+ "scrapy.extensions": [
75
+ "logging_ext = scrapy_item_ingest.extensions.logging:LoggingExtension"
76
+ ],
77
+ },
78
+ python_requires=">=3.7",
79
+ include_package_data=True,
80
+ zip_safe=False,
81
+ )