protor 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protor-2.0.0/LICENSE +21 -0
- protor-2.0.0/PKG-INFO +297 -0
- protor-2.0.0/README.md +240 -0
- protor-2.0.0/protor/__init__.py +19 -0
- protor-2.0.0/protor/analyzer.py +293 -0
- protor-2.0.0/protor/cli.py +196 -0
- protor-2.0.0/protor/config.py +51 -0
- protor-2.0.0/protor/crawler.py +179 -0
- protor-2.0.0/protor/exceptions.py +46 -0
- protor-2.0.0/protor/models.py +64 -0
- protor-2.0.0/protor/scraper.py +390 -0
- protor-2.0.0/protor/theme.py +50 -0
- protor-2.0.0/protor/utils.py +48 -0
- protor-2.0.0/protor.egg-info/PKG-INFO +297 -0
- protor-2.0.0/protor.egg-info/SOURCES.txt +26 -0
- protor-2.0.0/protor.egg-info/dependency_links.txt +1 -0
- protor-2.0.0/protor.egg-info/entry_points.txt +2 -0
- protor-2.0.0/protor.egg-info/requires.txt +13 -0
- protor-2.0.0/protor.egg-info/top_level.txt +1 -0
- protor-2.0.0/pyproject.toml +126 -0
- protor-2.0.0/setup.cfg +4 -0
- protor-2.0.0/tests/test_analyzer.py +148 -0
- protor-2.0.0/tests/test_cli.py +129 -0
- protor-2.0.0/tests/test_crawler.py +191 -0
- protor-2.0.0/tests/test_integration.py +104 -0
- protor-2.0.0/tests/test_models.py +66 -0
- protor-2.0.0/tests/test_scraper.py +118 -0
- protor-2.0.0/tests/test_utils.py +106 -0
protor-2.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Pulkit
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
protor-2.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: protor
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: AI-powered web scraper and analyzer — async, offline-friendly, Ollama-backed
|
|
5
|
+
Author-email: Pulkit <work.pulkitpareek@gmail.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2025 Pulkit
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
Project-URL: Homepage, https://wtfpulkit.xyz
|
|
28
|
+
Project-URL: Repository, https://github.com/wtfpulkit/protor
|
|
29
|
+
Project-URL: Issues, https://github.com/wtfpulkit/protor/issues
|
|
30
|
+
Project-URL: Changelog, https://github.com/wtfpulkit/protor/blob/main/CHANGELOG.md
|
|
31
|
+
Keywords: web-scraper,cli,ollama,ai,crawler
|
|
32
|
+
Classifier: Development Status :: 4 - Beta
|
|
33
|
+
Classifier: Environment :: Console
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
36
|
+
Classifier: Programming Language :: Python :: 3
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
39
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
40
|
+
Classifier: Topic :: Utilities
|
|
41
|
+
Requires-Python: >=3.11
|
|
42
|
+
Description-Content-Type: text/markdown
|
|
43
|
+
License-File: LICENSE
|
|
44
|
+
Requires-Dist: aiohttp>=3.9
|
|
45
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
46
|
+
Requires-Dist: lxml>=5.0
|
|
47
|
+
Requires-Dist: rich>=13.7
|
|
48
|
+
Provides-Extra: dev
|
|
49
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
50
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
|
51
|
+
Requires-Dist: pytest-cov>=5.0; extra == "dev"
|
|
52
|
+
Requires-Dist: aioresponses>=0.7; extra == "dev"
|
|
53
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
54
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
55
|
+
Requires-Dist: pre-commit>=3.7; extra == "dev"
|
|
56
|
+
Dynamic: license-file
|
|
57
|
+
|
|
58
|
+
# protor
|
|
59
|
+
|
|
60
|
+
> scrape websites. analyze with ai. no bs.
|
|
61
|
+
|
|
62
|
+
a cli tool that actually works. scrapes web content with curl, feeds it to your local ollama models, gets insights. that's it.
|
|
63
|
+
|
|
64
|
+
## why this exists
|
|
65
|
+
|
|
66
|
+
because paying for web scraping apis is kinda mid when you can just use curl and a local llm. also because sometimes you need to analyze a bunch of sites and doing it manually is literally painful.
|
|
67
|
+
|
|
68
|
+
## what you need
|
|
69
|
+
|
|
70
|
+
- python 3.11+ (obviously)
|
|
71
|
+
- curl (you probably have it)
|
|
72
|
+
- [ollama](https://ollama.ai) running locally
|
|
73
|
+
|
|
74
|
+
### get ollama set up
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
# grab some models
|
|
78
|
+
ollama pull llama3
|
|
79
|
+
ollama pull mistral
|
|
80
|
+
ollama pull codellama
|
|
81
|
+
|
|
82
|
+
# start the server
|
|
83
|
+
ollama serve
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## install
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# clone this
|
|
90
|
+
git clone https://github.com/wtfpulkit/protor
|
|
91
|
+
cd protor
|
|
92
|
+
|
|
93
|
+
# install it
|
|
94
|
+
pip install -e .
|
|
95
|
+
|
|
96
|
+
# or just
|
|
97
|
+
pip install -r requirements.txt
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## how to use
|
|
101
|
+
|
|
102
|
+
### see what models you have
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
protor models
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### scrape stuff
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
# one site
|
|
112
|
+
protor scrape https://example.com
|
|
113
|
+
|
|
114
|
+
# multiple sites
|
|
115
|
+
protor scrape https://example.com https://another-site.com
|
|
116
|
+
|
|
117
|
+
# skip the js files if you want
|
|
118
|
+
protor scrape https://example.com --no-js
|
|
119
|
+
|
|
120
|
+
# custom settings
|
|
121
|
+
protor scrape https://example.com --output my_data --timeout 60
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### analyze what you scraped
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
# general vibes check
|
|
128
|
+
protor analyze
|
|
129
|
+
|
|
130
|
+
# tech stack deep dive
|
|
131
|
+
protor analyze --focus technical --model codellama
|
|
132
|
+
|
|
133
|
+
# seo audit
|
|
134
|
+
protor analyze --focus seo --model mistral
|
|
135
|
+
|
|
136
|
+
# content analysis
|
|
137
|
+
protor analyze --focus content
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### do both at once (recommended)
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
# basic usage
|
|
144
|
+
protor run https://example.com
|
|
145
|
+
|
|
146
|
+
# with options
|
|
147
|
+
protor run https://example.com https://another.com --model llama3 --focus technical
|
|
148
|
+
|
|
149
|
+
# go crazy
|
|
150
|
+
protor run https://site1.com https://site2.com https://site3.com \
|
|
151
|
+
--model mistral \
|
|
152
|
+
--focus seo \
|
|
153
|
+
--no-js
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## what the focus modes do
|
|
157
|
+
|
|
158
|
+
- **general** - overall content, main themes, what the site's about
|
|
159
|
+
- **technical** - frameworks, tech stack, how it's built
|
|
160
|
+
- **content** - writing quality, structure, how readable it is
|
|
161
|
+
- **seo** - meta tags, optimization stuff, what needs fixing
|
|
162
|
+
|
|
163
|
+
## what you get
|
|
164
|
+
|
|
165
|
+
### after scraping
|
|
166
|
+
|
|
167
|
+
```
|
|
168
|
+
data/
|
|
169
|
+
├── example_com/
|
|
170
|
+
│ ├── index.html # the actual html
|
|
171
|
+
│ ├── manifest.json # metadata and stuff
|
|
172
|
+
│ └── js/ # javascript files
|
|
173
|
+
└── sites_index.json # summary of everything
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
### after analysis
|
|
177
|
+
|
|
178
|
+
```
|
|
179
|
+
analysis/
|
|
180
|
+
├── README.md # readable report
|
|
181
|
+
└── analysis.json # raw data
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
## real examples
|
|
185
|
+
|
|
186
|
+
### quick content check
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
protor run https://blog.example.com --focus content
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
### technical audit
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
# grab everything including js
|
|
196
|
+
protor scrape https://webapp.example.com
|
|
197
|
+
|
|
198
|
+
# analyze the tech
|
|
199
|
+
protor analyze --focus technical --model codellama
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### competitor research
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
# scrape competitors
|
|
206
|
+
protor scrape https://competitor1.com https://competitor2.com https://competitor3.com
|
|
207
|
+
|
|
208
|
+
# get seo insights
|
|
209
|
+
protor analyze --focus seo --model mistral
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### batch analysis
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
protor run \
|
|
216
|
+
https://source1.com \
|
|
217
|
+
https://source2.com \
|
|
218
|
+
https://source3.com \
|
|
219
|
+
--model llama3
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
## when stuff breaks
|
|
223
|
+
|
|
224
|
+
### ollama issues
|
|
225
|
+
|
|
226
|
+
```bash
|
|
227
|
+
# make sure it's running
|
|
228
|
+
ollama serve
|
|
229
|
+
|
|
230
|
+
# check your models
|
|
231
|
+
ollama list
|
|
232
|
+
|
|
233
|
+
# pull a model if needed
|
|
234
|
+
ollama pull llama3
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
### curl failing
|
|
238
|
+
|
|
239
|
+
```bash
|
|
240
|
+
# test manually
|
|
241
|
+
curl -sL https://example.com
|
|
242
|
+
|
|
243
|
+
# try longer timeout
|
|
244
|
+
protor scrape https://example.com --timeout 120
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
### analysis taking forever
|
|
248
|
+
|
|
249
|
+
- use a smaller model
|
|
250
|
+
- scrape fewer sites
|
|
251
|
+
- use --no-js flag
|
|
252
|
+
- get better hardware lol
|
|
253
|
+
|
|
254
|
+
## pro tips
|
|
255
|
+
|
|
256
|
+
- always check robots.txt before scraping (be respectful)
|
|
257
|
+
- start with --no-js if you just need content
|
|
258
|
+
- codellama is best for technical analysis
|
|
259
|
+
- mistral is faster than llama3
|
|
260
|
+
- use custom output dirs for different projects
|
|
261
|
+
|
|
262
|
+
## what's inside
|
|
263
|
+
|
|
264
|
+
```
|
|
265
|
+
protor/
|
|
266
|
+
├── cli.py # command interface
|
|
267
|
+
├── scraper.py # does the scraping
|
|
268
|
+
├── analyzer.py # talks to ollama
|
|
269
|
+
└── utils.py # helper stuff
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
## customize it
|
|
273
|
+
|
|
274
|
+
want different analysis prompts? edit the `ANALYSIS_PROMPTS` in `protor/analyzer.py`
|
|
275
|
+
|
|
276
|
+
need different rate limits? check `protor/scraper.py` (0.3s between js files, 1s between sites)
|
|
277
|
+
|
|
278
|
+
## legal stuff
|
|
279
|
+
|
|
280
|
+
mit license. do whatever you want with it.
|
|
281
|
+
|
|
282
|
+
just don't be weird and scrape sites that explicitly say no. respect robots.txt. don't ddos anyone. you know, basic internet etiquette.
|
|
283
|
+
|
|
284
|
+
## tech stack
|
|
285
|
+
|
|
286
|
+
- ollama (local llm inference)
|
|
287
|
+
- beautifulsoup (html parsing)
|
|
288
|
+
- aiohttp (async http stuff)
|
|
289
|
+
- curl (the goat)
|
|
290
|
+
|
|
291
|
+
---
|
|
292
|
+
|
|
293
|
+
built because web scraping shouldn't require a phd or a credit card
|
|
294
|
+
|
|
295
|
+
made with spite and caffeine
|
|
296
|
+
|
|
297
|
+
star it if it's useful idk
|
protor-2.0.0/README.md
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
# protor
|
|
2
|
+
|
|
3
|
+
> scrape websites. analyze with ai. no bs.
|
|
4
|
+
|
|
5
|
+
a cli tool that actually works. scrapes web content with curl, feeds it to your local ollama models, gets insights. that's it.
|
|
6
|
+
|
|
7
|
+
## why this exists
|
|
8
|
+
|
|
9
|
+
because paying for web scraping apis is kinda mid when you can just use curl and a local llm. also because sometimes you need to analyze a bunch of sites and doing it manually is literally painful.
|
|
10
|
+
|
|
11
|
+
## what you need
|
|
12
|
+
|
|
13
|
+
- python 3.11+ (obviously)
|
|
14
|
+
- curl (you probably have it)
|
|
15
|
+
- [ollama](https://ollama.ai) running locally
|
|
16
|
+
|
|
17
|
+
### get ollama set up
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
# grab some models
|
|
21
|
+
ollama pull llama3
|
|
22
|
+
ollama pull mistral
|
|
23
|
+
ollama pull codellama
|
|
24
|
+
|
|
25
|
+
# start the server
|
|
26
|
+
ollama serve
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## install
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
# clone this
|
|
33
|
+
git clone https://github.com/wtfpulkit/protor
|
|
34
|
+
cd protor
|
|
35
|
+
|
|
36
|
+
# install it
|
|
37
|
+
pip install -e .
|
|
38
|
+
|
|
39
|
+
# or just
|
|
40
|
+
pip install -r requirements.txt
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## how to use
|
|
44
|
+
|
|
45
|
+
### see what models you have
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
protor models
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### scrape stuff
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
# one site
|
|
55
|
+
protor scrape https://example.com
|
|
56
|
+
|
|
57
|
+
# multiple sites
|
|
58
|
+
protor scrape https://example.com https://another-site.com
|
|
59
|
+
|
|
60
|
+
# skip the js files if you want
|
|
61
|
+
protor scrape https://example.com --no-js
|
|
62
|
+
|
|
63
|
+
# custom settings
|
|
64
|
+
protor scrape https://example.com --output my_data --timeout 60
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### analyze what you scraped
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# general vibes check
|
|
71
|
+
protor analyze
|
|
72
|
+
|
|
73
|
+
# tech stack deep dive
|
|
74
|
+
protor analyze --focus technical --model codellama
|
|
75
|
+
|
|
76
|
+
# seo audit
|
|
77
|
+
protor analyze --focus seo --model mistral
|
|
78
|
+
|
|
79
|
+
# content analysis
|
|
80
|
+
protor analyze --focus content
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### do both at once (recommended)
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
# basic usage
|
|
87
|
+
protor run https://example.com
|
|
88
|
+
|
|
89
|
+
# with options
|
|
90
|
+
protor run https://example.com https://another.com --model llama3 --focus technical
|
|
91
|
+
|
|
92
|
+
# go crazy
|
|
93
|
+
protor run https://site1.com https://site2.com https://site3.com \
|
|
94
|
+
--model mistral \
|
|
95
|
+
--focus seo \
|
|
96
|
+
--no-js
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## what the focus modes do
|
|
100
|
+
|
|
101
|
+
- **general** - overall content, main themes, what the site's about
|
|
102
|
+
- **technical** - frameworks, tech stack, how it's built
|
|
103
|
+
- **content** - writing quality, structure, how readable it is
|
|
104
|
+
- **seo** - meta tags, optimization stuff, what needs fixing
|
|
105
|
+
|
|
106
|
+
## what you get
|
|
107
|
+
|
|
108
|
+
### after scraping
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
data/
|
|
112
|
+
├── example_com/
|
|
113
|
+
│ ├── index.html # the actual html
|
|
114
|
+
│ ├── manifest.json # metadata and stuff
|
|
115
|
+
│ └── js/ # javascript files
|
|
116
|
+
└── sites_index.json # summary of everything
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### after analysis
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
analysis/
|
|
123
|
+
├── README.md # readable report
|
|
124
|
+
└── analysis.json # raw data
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## real examples
|
|
128
|
+
|
|
129
|
+
### quick content check
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
protor run https://blog.example.com --focus content
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### technical audit
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
# grab everything including js
|
|
139
|
+
protor scrape https://webapp.example.com
|
|
140
|
+
|
|
141
|
+
# analyze the tech
|
|
142
|
+
protor analyze --focus technical --model codellama
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### competitor research
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
# scrape competitors
|
|
149
|
+
protor scrape https://competitor1.com https://competitor2.com https://competitor3.com
|
|
150
|
+
|
|
151
|
+
# get seo insights
|
|
152
|
+
protor analyze --focus seo --model mistral
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### batch analysis
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
protor run \
|
|
159
|
+
https://source1.com \
|
|
160
|
+
https://source2.com \
|
|
161
|
+
https://source3.com \
|
|
162
|
+
--model llama3
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## when stuff breaks
|
|
166
|
+
|
|
167
|
+
### ollama issues
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
# make sure it's running
|
|
171
|
+
ollama serve
|
|
172
|
+
|
|
173
|
+
# check your models
|
|
174
|
+
ollama list
|
|
175
|
+
|
|
176
|
+
# pull a model if needed
|
|
177
|
+
ollama pull llama3
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
### curl failing
|
|
181
|
+
|
|
182
|
+
```bash
|
|
183
|
+
# test manually
|
|
184
|
+
curl -sL https://example.com
|
|
185
|
+
|
|
186
|
+
# try longer timeout
|
|
187
|
+
protor scrape https://example.com --timeout 120
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### analysis taking forever
|
|
191
|
+
|
|
192
|
+
- use a smaller model
|
|
193
|
+
- scrape fewer sites
|
|
194
|
+
- use --no-js flag
|
|
195
|
+
- get better hardware lol
|
|
196
|
+
|
|
197
|
+
## pro tips
|
|
198
|
+
|
|
199
|
+
- always check robots.txt before scraping (be respectful)
|
|
200
|
+
- start with --no-js if you just need content
|
|
201
|
+
- codellama is best for technical analysis
|
|
202
|
+
- mistral is faster than llama3
|
|
203
|
+
- use custom output dirs for different projects
|
|
204
|
+
|
|
205
|
+
## what's inside
|
|
206
|
+
|
|
207
|
+
```
|
|
208
|
+
protor/
|
|
209
|
+
├── cli.py # command interface
|
|
210
|
+
├── scraper.py # does the scraping
|
|
211
|
+
├── analyzer.py # talks to ollama
|
|
212
|
+
└── utils.py # helper stuff
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## customize it
|
|
216
|
+
|
|
217
|
+
want different analysis prompts? edit the `ANALYSIS_PROMPTS` in `protor/analyzer.py`
|
|
218
|
+
|
|
219
|
+
need different rate limits? check `protor/scraper.py` (0.3s between js files, 1s between sites)
|
|
220
|
+
|
|
221
|
+
## legal stuff
|
|
222
|
+
|
|
223
|
+
mit license. do whatever you want with it.
|
|
224
|
+
|
|
225
|
+
just don't be weird and scrape sites that explicitly say no. respect robots.txt. don't ddos anyone. you know, basic internet etiquette.
|
|
226
|
+
|
|
227
|
+
## tech stack
|
|
228
|
+
|
|
229
|
+
- ollama (local llm inference)
|
|
230
|
+
- beautifulsoup (html parsing)
|
|
231
|
+
- aiohttp (async http stuff)
|
|
232
|
+
- curl (the goat)
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
built because web scraping shouldn't require a phd or a credit card
|
|
237
|
+
|
|
238
|
+
made with spite and caffeine
|
|
239
|
+
|
|
240
|
+
star it if it's useful idk
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""
|
|
2
|
+
protor — async web scraper and AI analyzer.
|
|
3
|
+
|
|
4
|
+
Quickstart
|
|
5
|
+
----------
|
|
6
|
+
from protor import scrape_multiple, analyze_with_ollama
|
|
7
|
+
|
|
8
|
+
index = scrape_multiple(["https://example.com"])
|
|
9
|
+
analyze_with_ollama(index, model="llama3", focus="general")
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
__version__ = version("protor")
|
|
16
|
+
except PackageNotFoundError:
|
|
17
|
+
__version__ = "dev"
|
|
18
|
+
|
|
19
|
+
__all__ = ["__version__"]
|