crawlit-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlit-0.1.0/LICENSE +21 -0
- crawlit-0.1.0/MANIFEST.in +24 -0
- crawlit-0.1.0/PKG-INFO +205 -0
- crawlit-0.1.0/README.md +177 -0
- crawlit-0.1.0/RELEASE_NOTES.md +50 -0
- crawlit-0.1.0/crawlit/__init__.py +25 -0
- crawlit-0.1.0/crawlit/crawler/__init__.py +15 -0
- crawlit-0.1.0/crawlit/crawler/engine.py +201 -0
- crawlit-0.1.0/crawlit/crawler/fetcher.py +92 -0
- crawlit-0.1.0/crawlit/crawler/parser.py +87 -0
- crawlit-0.1.0/crawlit/crawler/robots.py +117 -0
- crawlit-0.1.0/crawlit/crawlit.py +107 -0
- crawlit-0.1.0/crawlit/output/__init__.py +20 -0
- crawlit-0.1.0/crawlit/output/formatters.py +363 -0
- crawlit-0.1.0/crawlit.egg-info/SOURCES.txt +25 -0
- crawlit-0.1.0/examples/programmatic_results.csv +16 -0
- crawlit-0.1.0/examples/programmatic_results.json +682 -0
- crawlit-0.1.0/examples/programmatic_usage.py +99 -0
- crawlit-0.1.0/pyproject.toml +45 -0
- crawlit-0.1.0/requirements.txt +3 -0
- crawlit-0.1.0/setup.cfg +4 -0
- crawlit-0.1.0/setup.py +25 -0
- crawlit-0.1.0/tests/__init__.py +5 -0
- crawlit-0.1.0/tests/conftest.py +36 -0
- crawlit-0.1.0/tests/test_cli.py +256 -0
- crawlit-0.1.0/tests/test_engine.py +303 -0
- crawlit-0.1.0/tests/test_integration.py +273 -0
- crawlit-0.1.0/tests/test_library.py +367 -0
crawlit-0.1.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 The crawlit Authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
crawlit-0.1.0/MANIFEST.in
ADDED
@@ -0,0 +1,24 @@
+include README.md
+include requirements.txt
+include pyproject.toml
+include LICENSE
+
+# Include documentation
+include *.md
+
+# Include examples
+recursive-include examples *
+
+# Include core modules
+recursive-include crawlit/crawler *
+recursive-include crawlit/output *
+
+# Include tests
+recursive-include tests *
+
+# Include any data files
+# If you have a data directory, uncomment this line
+# recursive-include crawlit/data *
+
+# Global excludes for common temporary/generated files
+global-exclude *.egg-info/*
crawlit-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,205 @@
+Metadata-Version: 2.4
+Name: crawlit
+Version: 0.1.0
+Summary: Modular, Ethical Python Web Crawler
+Author-email: Swayam Dani <swayamdani@swayamdani.com>
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/SwayamDani/crawlit
+Project-URL: Bug Tracker, https://github.com/SwayamDani/crawlit/issues
+Project-URL: Documentation, https://github.com/SwayamDani/crawlit/tree/main/docs
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Operating System :: OS Independent
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: requests>=2.25.0
+Requires-Dist: beautifulsoup4>=4.9.0
+Requires-Dist: lxml>=4.6.0
+Provides-Extra: cli
+Dynamic: license-file
+Dynamic: requires-python
+
+# 🕷️ crawlit - Modular, Ethical Python Web Crawler
+
+[](https://opensource.org/licenses/MIT)
+[](https://www.python.org/downloads/)
+
+A powerful, modular, and ethical web crawler built in Python. Designed for security testing, link extraction, and website structure mapping with a focus on clean architecture and extensibility.
+
+## 🚀 Features
+
+- **Modular Architecture**: Easily extend with custom modules and parsers
+- **Ethical Crawling**: Configurable robots.txt compliance and rate limiting
+- **Depth Control**: Set maximum crawl depth to prevent excessive resource usage
+- **Domain Filtering**: Restrict crawling to specific domains or subdomains
+- **Robust Error Handling**: Gracefully manage connection issues and malformed pages
+- **Multiple Output Formats**: Export results as JSON, CSV, or plain text
+- **Detailed Logging**: Comprehensive logging of all crawler activities
+- **Command Line Interface**: Simple, powerful CLI for easy usage
+- **Programmatic API**: Use as a library in your own Python code
+
+## 📋 Requirements
+
+- Python 3.8+
+- Dependencies (will be listed in requirements.txt)
+
+## 🛠️ Installation
+
+### From PyPI (recommended)
+
+```bash
+# Install the core library
+pip install crawlit
+
+# Install with CLI tool support
+pip install crawlit[cli]
+```
+
+### From Source
+
+```bash
+# Clone the repository
+git clone https://github.com/SwayamDani/crawlit.git
+cd crawlit
+
+# Create a virtual environment (recommended)
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+
+# Install dependencies
+pip install -r requirements.txt
+
+# Install in development mode
+pip install -e .
+```
+
+## 📘 Usage
+
+### API Documentation
+
+Full API documentation is available in the `docs` directory. To build and view the documentation:
+
+```bash
+# Install Sphinx and required packages
+pip install sphinx sphinx_rtd_theme sphinxcontrib-napoleon
+
+# Build the documentation
+cd docs
+make html  # On Windows: make.bat html
+
+# View the documentation
+# Open docs/_build/html/index.html in your browser
+```
+
+### As a Library in Your Python Code
+
+```python
+from crawlit import Crawler, save_results, generate_summary_report
+
+# Initialize the crawler with custom parameters
+crawler = Crawler(
+    start_url="https://example.com",
+    max_depth=3,
+    internal_only=True,
+    user_agent="MyCustomBot/1.0",
+    delay=0.5,
+    respect_robots=True
+)
+
+# Start crawling
+crawler.crawl()
+
+# Get and process results
+results = crawler.get_results()
+print(f"Crawled {len(results)} URLs")
+
+# Save results in different formats
+save_results(results, "json", "crawl_results.json", pretty=True)
+```
+
+See the `examples/programmatic_usage.py` file for a complete example.
+
+### Command Line Interface
+
+If you installed with `pip install crawlit[cli]`, you can use the command-line interface:
+
+```bash
+# Basic usage
+crawlit --url https://example.com
+
+# Advanced options
+crawlit --url https://example.com \
+  --depth 3 \
+  --output-format json \
+  --output results.json \
+  --delay 0.5 \
+  --user-agent "crawlit/1.0" \
+  --ignore-robots
+```
+
+### Command Line Arguments
+
+| Argument | Description | Default |
+|----------|-------------|---------|
+| `--url`, `-u` | Target website URL | Required |
+| `--depth`, `-d` | Maximum crawl depth | 3 |
+| `--output-format`, `-f` | Output format (json, csv, txt, html) | json |
+| `--output`, `-O` | File to save results | crawl_results.json |
+| `--pretty-json`, `-p` | Enable pretty-print JSON with indentation | False |
+| `--ignore-robots`, `-i` | Ignore robots.txt rules | False |
+| `--delay` | Delay between requests (seconds) | 0.1 |
+| `--user-agent`, `-a` | Custom User-Agent string | crawlit/1.0 |
+| `--allow-external`, `-e` | Allow crawling URLs outside initial domain | False |
+| `--summary`, `-s` | Show a summary of crawl results | False |
+| `--verbose`, `-v` | Verbose output | False |
+| `--help`, `-h` | Show help message | - |
+
+## 🏗️ Project Structure
+
+```
+crawlit/
+├── crawlit.py            # CLI entry point
+├── requirements.txt      # Project dependencies
+├── crawler/              # Core crawler modules
+│   ├── __init__.py
+│   ├── engine.py         # Core crawler logic
+│   ├── fetcher.py        # HTTP request handling
+│   ├── parser.py         # HTML parsing and link extraction
+│   └── robots.py         # Robots.txt parser
+├── output/               # Output formatters
+│   ├── __init__.py
+│   └── formatters.py     # Output formatting functions
+├── examples/             # Example usage
+│   └── programmatic_usage.py  # Example of using as a library
+└── tests/                # Unit and integration tests
+    └── __init__.py
+```
+
+## 📅 Project Timeline
+
+- **May 2025**: Initial structure and CLI setup
+- **June 2025**: Core functionality complete (HTTP handling, parsing, domain control)
+- **June 30, 2025**: Project completion target with all core features
+
+## 🤝 Contributing
+
+Contributions will be welcome after the core functionality is complete. Please check back after June 30, 2025, for contribution guidelines.
+
+## 📜 License
+
+This project is licensed under the MIT License - see the LICENSE file for details.
+
+## 👤 Author
+
+Built and maintained by Swayam Dani
+
+---
+
+**Note**: This project is under active development with completion targeted for June 30, 2025.
crawlit-0.1.0/README.md
ADDED
@@ -0,0 +1,177 @@
+# 🕷️ crawlit - Modular, Ethical Python Web Crawler
+
+[](https://opensource.org/licenses/MIT)
+[](https://www.python.org/downloads/)
+
+A powerful, modular, and ethical web crawler built in Python. Designed for security testing, link extraction, and website structure mapping with a focus on clean architecture and extensibility.
+
+## 🚀 Features
+
+- **Modular Architecture**: Easily extend with custom modules and parsers
+- **Ethical Crawling**: Configurable robots.txt compliance and rate limiting
+- **Depth Control**: Set maximum crawl depth to prevent excessive resource usage
+- **Domain Filtering**: Restrict crawling to specific domains or subdomains
+- **Robust Error Handling**: Gracefully manage connection issues and malformed pages
+- **Multiple Output Formats**: Export results as JSON, CSV, or plain text
+- **Detailed Logging**: Comprehensive logging of all crawler activities
+- **Command Line Interface**: Simple, powerful CLI for easy usage
+- **Programmatic API**: Use as a library in your own Python code
+
+## 📋 Requirements
+
+- Python 3.8+
+- Dependencies (will be listed in requirements.txt)
+
+## 🛠️ Installation
+
+### From PyPI (recommended)
+
+```bash
+# Install the core library
+pip install crawlit
+
+# Install with CLI tool support
+pip install crawlit[cli]
+```
+
+### From Source
+
+```bash
+# Clone the repository
+git clone https://github.com/SwayamDani/crawlit.git
+cd crawlit
+
+# Create a virtual environment (recommended)
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+
+# Install dependencies
+pip install -r requirements.txt
+
+# Install in development mode
+pip install -e .
+```
+
+## 📘 Usage
+
+### API Documentation
+
+Full API documentation is available in the `docs` directory. To build and view the documentation:
+
+```bash
+# Install Sphinx and required packages
+pip install sphinx sphinx_rtd_theme sphinxcontrib-napoleon
+
+# Build the documentation
+cd docs
+make html  # On Windows: make.bat html
+
+# View the documentation
+# Open docs/_build/html/index.html in your browser
+```
+
+### As a Library in Your Python Code
+
+```python
+from crawlit import Crawler, save_results, generate_summary_report
+
+# Initialize the crawler with custom parameters
+crawler = Crawler(
+    start_url="https://example.com",
+    max_depth=3,
+    internal_only=True,
+    user_agent="MyCustomBot/1.0",
+    delay=0.5,
+    respect_robots=True
+)
+
+# Start crawling
+crawler.crawl()
+
+# Get and process results
+results = crawler.get_results()
+print(f"Crawled {len(results)} URLs")
+
+# Save results in different formats
+save_results(results, "json", "crawl_results.json", pretty=True)
+```
+
+See the `examples/programmatic_usage.py` file for a complete example.
+
+### Command Line Interface
+
+If you installed with `pip install crawlit[cli]`, you can use the command-line interface:
+
+```bash
+# Basic usage
+crawlit --url https://example.com
+
+# Advanced options
+crawlit --url https://example.com \
+  --depth 3 \
+  --output-format json \
+  --output results.json \
+  --delay 0.5 \
+  --user-agent "crawlit/1.0" \
+  --ignore-robots
+```

+### Command Line Arguments
+
+| Argument | Description | Default |
+|----------|-------------|---------|
+| `--url`, `-u` | Target website URL | Required |
+| `--depth`, `-d` | Maximum crawl depth | 3 |
+| `--output-format`, `-f` | Output format (json, csv, txt, html) | json |
+| `--output`, `-O` | File to save results | crawl_results.json |
+| `--pretty-json`, `-p` | Enable pretty-print JSON with indentation | False |
+| `--ignore-robots`, `-i` | Ignore robots.txt rules | False |
+| `--delay` | Delay between requests (seconds) | 0.1 |
+| `--user-agent`, `-a` | Custom User-Agent string | crawlit/1.0 |
+| `--allow-external`, `-e` | Allow crawling URLs outside initial domain | False |
+| `--summary`, `-s` | Show a summary of crawl results | False |
+| `--verbose`, `-v` | Verbose output | False |
+| `--help`, `-h` | Show help message | - |
+
+## 🏗️ Project Structure
+
+```
+crawlit/
+├── crawlit.py            # CLI entry point
+├── requirements.txt      # Project dependencies
+├── crawler/              # Core crawler modules
+│   ├── __init__.py
+│   ├── engine.py         # Core crawler logic
+│   ├── fetcher.py        # HTTP request handling
+│   ├── parser.py         # HTML parsing and link extraction
+│   └── robots.py         # Robots.txt parser
+├── output/               # Output formatters
+│   ├── __init__.py
+│   └── formatters.py     # Output formatting functions
+├── examples/             # Example usage
+│   └── programmatic_usage.py  # Example of using as a library
+└── tests/                # Unit and integration tests
+    └── __init__.py
+```
+
+## 📅 Project Timeline
+
+- **May 2025**: Initial structure and CLI setup
+- **June 2025**: Core functionality complete (HTTP handling, parsing, domain control)
+- **June 30, 2025**: Project completion target with all core features
+
+## 🤝 Contributing
+
+Contributions will be welcome after the core functionality is complete. Please check back after June 30, 2025, for contribution guidelines.
+
+## 📜 License
+
+This project is licensed under the MIT License - see the LICENSE file for details.
+
+## 👤 Author
+
+Built and maintained by Swayam Dani
+
+---
+
+**Note**: This project is under active development with completion targeted for June 30, 2025.
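The README's library example stops at a single JSON export. As a minimal sketch of how the other exported helpers might be combined, the snippet below assumes that `save_results` accepts `"csv"` with the same positional arguments it takes for `"json"` (the package does ship a `programmatic_results.csv` example), and that `generate_summary_report` accepts the results object and returns printable text; neither signature is shown in this diff.

```python
from crawlit import Crawler, save_results, generate_summary_report

# Crawl a small site politely, using the constructor parameters documented above
crawler = Crawler(
    start_url="https://example.com",  # hypothetical target site
    max_depth=2,
    internal_only=True,
    delay=1.0,                        # slower pacing than the 0.5 s shown in the README
    respect_robots=True,
)
crawler.crawl()
results = crawler.get_results()

# Export the same results twice: pretty-printed JSON plus CSV
# (assumes the csv formatter takes the same positional arguments as json)
save_results(results, "json", "crawl_results.json", pretty=True)
save_results(results, "csv", "crawl_results.csv")

# Print a human-readable summary; the exact signature of
# generate_summary_report is assumed here (results in, text out)
print(generate_summary_report(results))
```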
crawlit-0.1.0/RELEASE_NOTES.md
ADDED
@@ -0,0 +1,50 @@
+# Crawlit 0.1.0 - Release Notes
+
+We are pleased to announce the first public release of Crawlit - a modular, ethical Python web crawler.
+
+## Features
+
+- **Modular Architecture**: Easily extend with custom modules and parsers
+- **Ethical Crawling**: Configurable robots.txt compliance and rate limiting
+- **Depth Control**: Set maximum crawl depth to prevent excessive resource usage
+- **Domain Filtering**: Restrict crawling to specific domains or subdomains
+- **Robust Error Handling**: Gracefully manage connection issues and malformed pages
+- **Multiple Output Formats**: Export results as JSON, CSV, or plain text
+- **Detailed Logging**: Comprehensive logging of all crawler activities
+- **Command Line Interface**: Simple, powerful CLI for easy usage
+- **Programmatic API**: Use as a library in your own Python code
+
+## Installation
+
+```bash
+# Install the core library
+pip install crawlit
+
+# Install with CLI tool support
+pip install crawlit[cli]
+```
+
+## Documentation
+
+Comprehensive API documentation is now available in the `docs` directory. To build and view the documentation:
+
+```bash
+# Install Sphinx and required packages
+pip install sphinx sphinx_rtd_theme sphinxcontrib-napoleon
+
+# Build the documentation
+cd docs
+make html  # On Windows: make.bat html
+
+# View the documentation
+# Open docs/_build/html/index.html in your browser
+```
+
+## Known Issues
+
+- Limited support for JavaScript-rendered content
+- No advanced request throttling based on domain
+
+## Acknowledgments
+
+Thanks to all the early testers and contributors who helped make this release possible.
crawlit-0.1.0/crawlit/__init__.py
ADDED
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+"""
+crawlit - Modular, Ethical Python Web Crawler
+
+A flexible web crawler library that can be used programmatically or via CLI.
+"""
+
+__version__ = '0.1.0'
+
+# Export core functionality
+from crawlit.crawler.engine import Crawler
+from crawlit.output.formatters import save_results, generate_summary_report
+
+# CLI functionality (but not executed on import)
+def cli_main():
+    """Entry point for the CLI interface when installed with [cli] option"""
+    from crawlit.crawlit import main
+    return main()
+
+__all__ = [
+    'Crawler',  # Main crawler engine
+    'save_results',  # Output formatters
+    'generate_summary_report',
+    'cli_main'  # CLI entry point
+]
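This `__init__.py` is the package's entire public surface: `Crawler`, the two formatter helpers, `__version__`, and `cli_main`, which defers to `crawlit.crawlit.main()`. A small sketch of driving that entry point programmatically follows, under the assumption that `main()` reads its options from `sys.argv` (the diff does not show its argument handling); the flags themselves come from the CLI table in the README.

```python
import sys
import crawlit

print(crawlit.__version__)  # '0.1.0'

# cli_main() simply calls crawlit.crawlit.main(); assuming main() parses
# sys.argv, a programmatic invocation could look like this:
sys.argv = ["crawlit", "--url", "https://example.com", "--depth", "1", "--summary"]
exit_code = crawlit.cli_main()
```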
crawlit-0.1.0/crawlit/crawler/__init__.py
ADDED
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+"""
+Crawler package containing the core modules for the crawlit web crawler
+"""
+
+from .engine import Crawler
+from .fetcher import fetch_page
+from .parser import extract_links, _process_url
+
+__all__ = [
+    'Crawler',
+    'fetch_page',
+    'extract_links',
+    '_process_url'
+]
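The crawler subpackage also re-exports the lower-level building blocks `fetch_page` and `extract_links` (plus the private `_process_url`). Their signatures are not visible in this diff, so the following is purely illustrative; every argument and return shape is an assumption, noted inline, not the documented API.

```python
# Illustrative only: fetch_page and extract_links exist in crawlit.crawler,
# but their signatures are not shown in this diff. The call shapes below
# are assumptions made for the sake of the sketch.
from crawlit.crawler import fetch_page, extract_links

url = "https://example.com"
page = fetch_page(url)            # assumed: returns the fetched page content/response
links = extract_links(page, url)  # assumed: (content, base_url) -> iterable of URLs
for link in links:
    print(link)
```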