ispider 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ispider-0.1.0/LICENCE +7 -0
- ispider-0.1.0/MANIFEST.in +3 -0
- ispider-0.1.0/PKG-INFO +302 -0
- ispider-0.1.0/README.md +278 -0
- ispider-0.1.0/ispider.egg-info/SOURCES.txt +40 -0
- ispider-0.1.0/ispider_core/__init__.py +1 -0
- ispider-0.1.0/ispider_core/__main__.py +30 -0
- ispider-0.1.0/ispider_core/addons/out_parser_full_to_json.py +102 -0
- ispider-0.1.0/ispider_core/crawlers/cls_controllers.py +155 -0
- ispider-0.1.0/ispider_core/crawlers/cls_queue_out.py +111 -0
- ispider-0.1.0/ispider_core/crawlers/cls_seen_filter.py +73 -0
- ispider-0.1.0/ispider_core/crawlers/http_client.py +193 -0
- ispider-0.1.0/ispider_core/crawlers/http_filters.py +52 -0
- ispider-0.1.0/ispider_core/crawlers/stage1_crawl.py +188 -0
- ispider-0.1.0/ispider_core/crawlers/stage1_crawl_helpers.py +75 -0
- ispider-0.1.0/ispider_core/crawlers/stage2_spider.py +195 -0
- ispider-0.1.0/ispider_core/crawlers/thread_queue_in.py +81 -0
- ispider-0.1.0/ispider_core/crawlers/thread_save_finished.py +67 -0
- ispider-0.1.0/ispider_core/crawlers/thread_stats.py +99 -0
- ispider-0.1.0/ispider_core/ispider.py +76 -0
- ispider-0.1.0/ispider_core/orchestrator.py +37 -0
- ispider-0.1.0/ispider_core/parsers/email_parser.py +0 -0
- ispider-0.1.0/ispider_core/parsers/filetype_parser.py +42 -0
- ispider-0.1.0/ispider_core/parsers/html_parser.py +148 -0
- ispider-0.1.0/ispider_core/parsers/sitemaps_parser.py +94 -0
- ispider-0.1.0/ispider_core/parsers/social_parser.py +0 -0
- ispider-0.1.0/ispider_core/settings.py +38 -0
- ispider-0.1.0/ispider_core/storage/json_storage.py +0 -0
- ispider-0.1.0/ispider_core/utils/controllers.py +5 -0
- ispider-0.1.0/ispider_core/utils/domains.py +22 -0
- ispider-0.1.0/ispider_core/utils/efiles.py +29 -0
- ispider-0.1.0/ispider_core/utils/filters.py +1 -0
- ispider-0.1.0/ispider_core/utils/headers.py +39 -0
- ispider-0.1.0/ispider_core/utils/ifiles.py +77 -0
- ispider-0.1.0/ispider_core/utils/logger.py +54 -0
- ispider-0.1.0/ispider_core/utils/menu.py +43 -0
- ispider-0.1.0/ispider_core/utils/queues.py +63 -0
- ispider-0.1.0/ispider_core/utils/resume.py +89 -0
- ispider-0.1.0/pyproject.toml +36 -0
- ispider-0.1.0/setup.cfg +4 -0
- ispider-0.1.0/tests/10_scrape_websites.py +27 -0
- ispider-0.1.0/tests/scrape_njorg.py +31 -0
- ispider-0.1.0/tests/test_run.py +5 -0
ispider-0.1.0/LICENCE
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright 2025 DANIELE RUGGINENTI
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
ispider-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ispider
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A high-speed web spider for massive scraping.
|
|
5
|
+
Author-email: Daniele Rugginenti <daniele.rugginenti@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENCE
|
|
9
|
+
Requires-Dist: aiohttp
|
|
10
|
+
Requires-Dist: beautifulsoup4
|
|
11
|
+
Requires-Dist: lxml
|
|
12
|
+
Requires-Dist: tqdm
|
|
13
|
+
Requires-Dist: requests
|
|
14
|
+
Requires-Dist: httpx
|
|
15
|
+
Requires-Dist: nslookup
|
|
16
|
+
Requires-Dist: tldextract
|
|
17
|
+
Requires-Dist: concurrent_log_handler
|
|
18
|
+
Requires-Dist: colorlog
|
|
19
|
+
Requires-Dist: brotli
|
|
20
|
+
Requires-Dist: validators
|
|
21
|
+
Requires-Dist: w3lib
|
|
22
|
+
Requires-Dist: pybloom_live
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
# ispider_core
|
|
26
|
+
|
|
27
|
+
# V0.1
|
|
28
|
+
|
|
29
|
+
### Help
|
|
30
|
+
Show all the options
|
|
31
|
+
```
|
|
32
|
+
python3 run.py --help
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Crawl - PIPELINE STEP 1
|
|
36
|
+
You can specify an input file with the full path. The file must contain the field ***domain*** (if no protocol is specified, https will be used as default)
|
|
37
|
+
```
|
|
38
|
+
python3 run.py --crawl -file commons/inputs/size_report_urls.csv
|
|
39
|
+
```
|
|
40
|
+
You can specify a input file in the **commons/input** folder
|
|
41
|
+
```
|
|
42
|
+
python3 run.py --crawl -file size_report_urls.csv
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
You can specify one url, with or without the protocol (https://)
|
|
46
|
+
```
|
|
47
|
+
python3 run.py --crawl -one 'capitolawatches.com'
|
|
48
|
+
```
|
|
49
|
+
You can specify a subfolder to dump all file.
|
|
50
|
+
Everything will be saved in dumps/xxx and will be available and independent from other subfolders
|
|
51
|
+
```
|
|
52
|
+
python3 run.py --crawl -file FILE -sub-folder SUBFOLDER
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
This is a working command used on the server, for v1.3
|
|
56
|
+
```
|
|
57
|
+
python3 -u run.py --crawl -file commons/inputs/shopify_100k.csv -pools 2 -proc 2 -dns-server 127.0.0.53
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
**Tested with more processes, faster, good in retrieving and retry/error corrections in v1.4**
|
|
61
|
+
```
|
|
62
|
+
python3 -u run.py --crawl -file commons/inputs/shopify_100k.csv -pools 16 -proc 16 -dns-server 127.0.0.53
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Parse
|
|
66
|
+
##### Parse ALL
|
|
67
|
+
Those scripts are configured to parse everything and insert it into the DB, test or prod, depending on the script used
|
|
68
|
+
You can define the subfolder SUBFOLDER where to get the data
|
|
69
|
+
- TEST
|
|
70
|
+
- `./parse_test.sh SUBFOLDER`
|
|
71
|
+
|
|
72
|
+
- PROD
|
|
73
|
+
- `./parse_prod.sh SUBFOLDER`
|
|
74
|
+
|
|
75
|
+
##### PIPELINE STEP 2
|
|
76
|
+
This to create a report for the connections metadata
|
|
77
|
+
```
|
|
78
|
+
python3 run.py --parse -pools 24 -proc 24 conn
|
|
79
|
+
```
|
|
80
|
+
##### PIPELINE STEP 3
|
|
81
|
+
This to parse the landing page
|
|
82
|
+
```
|
|
83
|
+
python3 run.py --parse -pools 24 -proc 24 landing
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
##### PIPELINE STEP 4 - EMAILS
|
|
87
|
+
0. *STAGE 0*
|
|
88
|
+
Extract all emails from html pages
|
|
89
|
+
Create emails csv stage 0 in output, and st0 jsons in dump folder
|
|
90
|
+
```
|
|
91
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st0
|
|
92
|
+
```
|
|
93
|
+
1. *STAGE 1*
|
|
94
|
+
From jsons produced by **st0**,
|
|
95
|
+
**group by email**, counts how many domains contains it
|
|
96
|
+
produce csv in output and st1 jsons in the dump folder
|
|
97
|
+
```
|
|
98
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st1
|
|
99
|
+
```
|
|
100
|
+
2. *STAGE 2*
|
|
101
|
+
From jsons produced by **st0**,
|
|
102
|
+
**Email Classification**
|
|
103
|
+
produce csv and st2 jsons
|
|
104
|
+
**DNS Server needed for email_domain resolution**
|
|
105
|
+
```
|
|
106
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER -dns-server 127.0.0.53 emails-st2
|
|
107
|
+
```
|
|
108
|
+
3. *STAGE 3*
|
|
109
|
+
From jsons produced by **st2**
|
|
110
|
+
extract all usable emails (is_usable is True)
|
|
111
|
+
produce a csv and **st3** jsons
|
|
112
|
+
```
|
|
113
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st3
|
|
114
|
+
```
|
|
115
|
+
4. *STAGE 4*
|
|
116
|
+
**DB Insert**
|
|
117
|
+
```
|
|
118
|
+
xtra='-sub-folder SUBFOLDER --just-emails'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
To execute all stages:
|
|
122
|
+
```
|
|
123
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st-all
|
|
124
|
+
xtra='-sub-folder SUBFOLDER --just-emails'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
##### PIPELINE STEP 5
|
|
128
|
+
```
|
|
129
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER socials
|
|
130
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER url-internal
|
|
131
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER url-sitemaps
|
|
132
|
+
```
|
|
133
|
+
##### PIPELINE STEP 6
|
|
134
|
+
Will join all the companies based on final_url_domain, shopid, etc
|
|
135
|
+
```
|
|
136
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
#### PIPELINE STEP 7
|
|
140
|
+
Company Names St0
|
|
141
|
+
A funcPermNames = [
|
|
142
|
+
removeNone, removeHyphen, removeQuote, removeSpace, removeDot, removeComma,
|
|
143
|
+
removeEmojis, replaceUnidecode,
|
|
144
|
+
replaceAnd, replaceUnd, replaceY, removeAnd,
|
|
145
|
+
replaceSpecial
|
|
146
|
+
];
|
|
147
|
+
was used,
|
|
148
|
+
and a "combination without repetition" function to call those functions on the 'site_name_cleaned' from the landing page, in the order specified (for the case of the &).
|
|
149
|
+
|
|
150
|
+
So when company_name_classification = 'cf_name_no_dots_no_spaces_no_comma_replace_special_dom_no_hyphen'
|
|
151
|
+
that means something like
|
|
152
|
+
site_name_cleaned = 'Wel.do,ne s'a Cement™'
|
|
153
|
+
final_domain = "welldone-cement.com"
|
|
154
|
+
|
|
155
|
+
so in the string
|
|
156
|
+
-cf is always when match
|
|
157
|
+
-name_ it's everything related to name
|
|
158
|
+
-dom_ it's everything related to dom
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
##### PIPELINE STEP FINAL
|
|
162
|
+
### DB Insert
|
|
163
|
+
- To recreate tables and insert all CSV in DB ***ecomm_test***
|
|
164
|
+
```
|
|
165
|
+
python3 run.py dbdt; python3 run.py dbct; python3 run.py dbi;
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
- To insert all CSV in DB ***ecomm_prod***
|
|
169
|
+
```
|
|
170
|
+
python3 run.py --prod dbdt; python3 run.py --prod dbct; python3 run.py --prod dbi;
|
|
171
|
+
```
|
|
172
|
+
- To insert all CSV in DB ***ecomm_prod*** from some **SUBFOLDER**
|
|
173
|
+
```
|
|
174
|
+
python3 run.py --prod dbdt; python3 run.py --prod dbct; python3 run.py -sub-folder --prod dbi;
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
# EXTRA FUNCTIONS
|
|
178
|
+
Check also the settings.py file for extra configuration, as
|
|
179
|
+
- proxy to be used
|
|
180
|
+
- async block size
|
|
181
|
+
- number of retries on error
|
|
182
|
+
etc.
|
|
183
|
+
|
|
184
|
+
##### Help
|
|
185
|
+
```
|
|
186
|
+
python3 run.py --help
|
|
187
|
+
```
|
|
188
|
+
will show all the available options of the script
|
|
189
|
+
|
|
190
|
+
##### Pools and Procs
|
|
191
|
+
- `-pools 4` will execute the script on 4 different cores, if available. Script will be spanned in 4 different processes
|
|
192
|
+
- `-proc 4` number of workers, should be **always a multiple of pools** to correctly distribute the job against different pools
|
|
193
|
+
|
|
194
|
+
##### DNS Server
|
|
195
|
+
`-dns-server 141.1.1.1` This is a DNS server that will be used when IP is not retrievable by httpx module. UDP output needs to be opened
|
|
196
|
+
|
|
197
|
+
This is a normal execution on the *bigbadboy* server to retrieve DNS information, because outbound UDP against google dns are blocked
|
|
198
|
+
```
|
|
199
|
+
python3 -u run.py --crawl -file commons/inputs/shopify_100k.csv -pools 2 -proc 2 -dns-server 127.0.0.53
|
|
200
|
+
```
|
|
201
|
+
##### Proxy
|
|
202
|
+
`--force-proxy` it will force to use the proxy PROXY_TO_USE set up in *settings.py* file
|
|
203
|
+
|
|
204
|
+
##### Test
|
|
205
|
+
`--test` will scrape just a percentage of the domains in input
|
|
206
|
+
The percentage that will be scraped depends on the parameter PORCENTAGE_TO_SCRAPE_IN_TEST defined in *settings.py*
|
|
207
|
+
|
|
208
|
+
##### SUB_FOLDERS
|
|
209
|
+
`-sub-folder SUBFOLDER` will use a different configuration for
|
|
210
|
+
- output folder
|
|
211
|
+
- dump folder
|
|
212
|
+
|
|
213
|
+
For consistency, this flag must be specified in ***db insert*** too
|
|
214
|
+
|
|
215
|
+
### All pipeline
|
|
216
|
+
So, the whole pipeline will become
|
|
217
|
+
```
|
|
218
|
+
python3 -u run.py --crawl -file commons/inputs/shopify_100k.csv -pools 2 -proc 2 -dns-server 127.0.0.53 -sub-folder SUBFOLDER
|
|
219
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER conn
|
|
220
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER landings
|
|
221
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st0
|
|
222
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st1
|
|
223
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER -dns-server 127.0.0.53 emails-st2
|
|
224
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st3
|
|
225
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER socials
|
|
226
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER url-internal
|
|
227
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER url-sitemaps
|
|
228
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies-st0
|
|
229
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies-st1
|
|
230
|
+
python3 run.py -sub-folder SUBFOLDER dbdt; python3 run.py -sub-folder SUBFOLDER dbct; python3 run.py -sub-folder SUBFOLDER dbi;
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
# HIGH LEVEL OPERATIONS
|
|
235
|
+
### EMAILS PIPELINE
|
|
236
|
+
If you want to apply the complete pipeline just for **emails**,
|
|
237
|
+
there is a flag --just-emails to apply to DB operations
|
|
238
|
+
|
|
239
|
+
- STEP1, recreate the output
|
|
240
|
+
```
|
|
241
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st0
|
|
242
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st1
|
|
243
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER -dns-server 127.0.0.53 emails-st2
|
|
244
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st3
|
|
245
|
+
```
|
|
246
|
+
- STEP2, recreate just the emails related tables: email_st0,email_st1,email_st2,email_st3
|
|
247
|
+
```
|
|
248
|
+
xtra='-sub-folder SUBFOLDER --just-emails'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
### COMPANIES
|
|
252
|
+
If you want to apply the complete pipeline just for **companies**,
|
|
253
|
+
there is a flag --just-companies to apply to DB operations
|
|
254
|
+
|
|
255
|
+
1. recreate the output
|
|
256
|
+
```
|
|
257
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies-st0
|
|
258
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies-st1
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
2. recreate just the emails related tables: email_st0,email_st1,email_st2,email_st3
|
|
262
|
+
```
|
|
263
|
+
xtra='-sub-folder SUBFOLDER --just-companies'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
### COMPANIES EXCLUSION
|
|
268
|
+
Create a table 'companies_exclusion' with an extra field set as "excluded" if some **domain_plus_tld** present in exclusion table based on **customer_id**
|
|
269
|
+
To define exclusion table parameters, in settings.py
|
|
270
|
+
- MYSQL_EXCLUSION_DB = 'customer'
|
|
271
|
+
- MYSQL_EXCLUSION_TABLE = 'exclusion';
|
|
272
|
+
- MYSQL_OUTPUT_COLUMN_CUSTOMER_ID = 'customer_id'
|
|
273
|
+
- MYSQL_OUTPUT_COLUMN_DOMAIN_WITH_TLD = 'domain_cleaned'
|
|
274
|
+
|
|
275
|
+
1. To run the script, this one **create a csv** file in output folder
|
|
276
|
+
```
|
|
277
|
+
python3 -u run.py --parse -sub-folder 20230422 -customer-id **test** --exact companies-exclusion
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
2. To **insert in DB**,
|
|
281
|
+
```
|
|
282
|
+
xtra='-sub-folder **20230422** --just-companies-exclusion'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
## PARQUET FILE EXPORTER
|
|
287
|
+
- Under ecommerce_crawler/commons/scripts/parquets
|
|
288
|
+
- Search for parquets files under ecommerce_crawler/data/SUB-FOLDER/METHOD/files
|
|
289
|
+
- To select domains, accept in input *one file with full path*, file must be a csv and contains a domain column
|
|
290
|
+
OR
|
|
291
|
+
- To select domains, Accept in input a *domain name part*,
|
|
292
|
+
|
|
293
|
+
- Accept as input the **-sub-folder**, to specify the relative directory
|
|
294
|
+
- Accept as input the **method** (urls-st0 or urls-st1 ie)
|
|
295
|
+
|
|
296
|
+
`python3 pq.py -file inputs/all_woo_shopify_3k.csv -sub-folder ALL_WOO_GB urls-st0`
|
|
297
|
+
`python3 pq.py -file inputs/all_woo_shopify_3k.csv -sub-folder ALL_WOO_GB urls-st1`
|
|
298
|
+
|
|
299
|
+
`python3 pq.py -name-part "keys4" -sub-folder ALL_WOO_GB urls-st1`
|
|
300
|
+
|
|
301
|
+
- Output will be saved in ecommerce_crawler/commons/scripts/parquets/outputs
|
|
302
|
+
- Output will be saved in the DB in a table whose name depends on the method, as defined in settings
|
ispider-0.1.0/README.md
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
# ispider_core
|
|
2
|
+
|
|
3
|
+
# V0.1
|
|
4
|
+
|
|
5
|
+
### Help
|
|
6
|
+
Show all the options
|
|
7
|
+
```
|
|
8
|
+
python3 run.py --help
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
### Crawl - PIPELINE STEP 1
|
|
12
|
+
You can specify an input file with the full path. The file must contain the field ***domain*** (if no protocol is specified, https will be used as default)
|
|
13
|
+
```
|
|
14
|
+
python3 run.py --crawl -file commons/inputs/size_report_urls.csv
|
|
15
|
+
```
|
|
16
|
+
You can specify a input file in the **commons/input** folder
|
|
17
|
+
```
|
|
18
|
+
python3 run.py --crawl -file size_report_urls.csv
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
You can specify one url, with or without the protocol (https://)
|
|
22
|
+
```
|
|
23
|
+
python3 run.py --crawl -one 'capitolawatches.com'
|
|
24
|
+
```
|
|
25
|
+
You can specify a subfolder to dump all file.
|
|
26
|
+
Everything will be saved in dumps/xxx and will be available and independent from other subfolders
|
|
27
|
+
```
|
|
28
|
+
python3 run.py --crawl -file FILE -sub-folder SUBFOLDER
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
This is a working command used on the server, for v1.3
|
|
32
|
+
```
|
|
33
|
+
python3 -u run.py --crawl -file commons/inputs/shopify_100k.csv -pools 2 -proc 2 -dns-server 127.0.0.53
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
**Tested with more processes, faster, good in retrieving and retry/error corrections in v1.4**
|
|
37
|
+
```
|
|
38
|
+
python3 -u run.py --crawl -file commons/inputs/shopify_100k.csv -pools 16 -proc 16 -dns-server 127.0.0.53
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### Parse
|
|
42
|
+
##### Parse ALL
|
|
43
|
+
Those scripts are configured to parse everything and insert it into the DB, test or prod, depending on the script used
|
|
44
|
+
You can define the subfolder SUBFOLDER where to get the data
|
|
45
|
+
- TEST
|
|
46
|
+
- `./parse_test.sh SUBFOLDER`
|
|
47
|
+
|
|
48
|
+
- PROD
|
|
49
|
+
- `./parse_prod.sh SUBFOLDER`
|
|
50
|
+
|
|
51
|
+
##### PIPELINE STEP 2
|
|
52
|
+
This to create a report for the connections metadata
|
|
53
|
+
```
|
|
54
|
+
python3 run.py --parse -pools 24 -proc 24 conn
|
|
55
|
+
```
|
|
56
|
+
##### PIPELINE STEP 3
|
|
57
|
+
This to parse the landing page
|
|
58
|
+
```
|
|
59
|
+
python3 run.py --parse -pools 24 -proc 24 landing
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
##### PIPELINE STEP 4 - EMAILS
|
|
63
|
+
0. *STAGE 0*
|
|
64
|
+
Extract all emails from html pages
|
|
65
|
+
Create emails csv stage 0 in output, and st0 jsons in dump folder
|
|
66
|
+
```
|
|
67
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st0
|
|
68
|
+
```
|
|
69
|
+
1. *STAGE 1*
|
|
70
|
+
From jsons produced by **st0**,
|
|
71
|
+
**group by email**, counts how many domains contains it
|
|
72
|
+
produce csv in output and st1 jsons in the dump folder
|
|
73
|
+
```
|
|
74
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st1
|
|
75
|
+
```
|
|
76
|
+
2. *STAGE 2*
|
|
77
|
+
From jsons produced by **st0**,
|
|
78
|
+
**Email Classification**
|
|
79
|
+
produce csv and st2 jsons
|
|
80
|
+
**DNS Server needed for email_domain resolution**
|
|
81
|
+
```
|
|
82
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER -dns-server 127.0.0.53 emails-st2
|
|
83
|
+
```
|
|
84
|
+
3. *STAGE 3*
|
|
85
|
+
From jsons produced by **st2**
|
|
86
|
+
extract all usable emails (is_usable is True)
|
|
87
|
+
produce a csv and **st3** jsons
|
|
88
|
+
```
|
|
89
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st3
|
|
90
|
+
```
|
|
91
|
+
4. *STAGE 4*
|
|
92
|
+
**DB Insert**
|
|
93
|
+
```
|
|
94
|
+
xtra='-sub-folder SUBFOLDER --just-emails'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
To execute all stages:
|
|
98
|
+
```
|
|
99
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st-all
|
|
100
|
+
xtra='-sub-folder SUBFOLDER --just-emails'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
##### PIPELINE STEP 5
|
|
104
|
+
```
|
|
105
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER socials
|
|
106
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER url-internal
|
|
107
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER url-sitemaps
|
|
108
|
+
```
|
|
109
|
+
##### PIPELINE STEP 6
|
|
110
|
+
Will join all the companies based on final_url_domain, shopid, etc
|
|
111
|
+
```
|
|
112
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
#### PIPELINE STEP 7
|
|
116
|
+
Company Names St0
|
|
117
|
+
A funcPermNames = [
|
|
118
|
+
removeNone, removeHyphen, removeQuote, removeSpace, removeDot, removeComma,
|
|
119
|
+
removeEmojis, replaceUnidecode,
|
|
120
|
+
replaceAnd, replaceUnd, replaceY, removeAnd,
|
|
121
|
+
replaceSpecial
|
|
122
|
+
];
|
|
123
|
+
was used,
|
|
124
|
+
and a "combination without repetition" function to call those functions on the 'site_name_cleaned' from the landing page, in the order specified (for the case of the &).
|
|
125
|
+
|
|
126
|
+
So when company_name_classification = 'cf_name_no_dots_no_spaces_no_comma_replace_special_dom_no_hyphen'
|
|
127
|
+
that means something like
|
|
128
|
+
site_name_cleaned = 'Wel.do,ne s'a Cement™'
|
|
129
|
+
final_domain = "welldone-cement.com"
|
|
130
|
+
|
|
131
|
+
so in the string
|
|
132
|
+
-cf is always when match
|
|
133
|
+
-name_ it's everything related to name
|
|
134
|
+
-dom_ it's everything related to dom
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
##### PIPELINE STEP FINAL
|
|
138
|
+
### DB Insert
|
|
139
|
+
- To recreate tables and insert all CSV in DB ***ecomm_test***
|
|
140
|
+
```
|
|
141
|
+
python3 run.py dbdt; python3 run.py dbct; python3 run.py dbi;
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
- To insert all CSV in DB ***ecomm_prod***
|
|
145
|
+
```
|
|
146
|
+
python3 run.py --prod dbdt; python3 run.py --prod dbct; python3 run.py --prod dbi;
|
|
147
|
+
```
|
|
148
|
+
- To insert all CSV in DB ***ecomm_prod*** from some **SUBFOLDER**
|
|
149
|
+
```
|
|
150
|
+
python3 run.py --prod dbdt; python3 run.py --prod dbct; python3 run.py -sub-folder --prod dbi;
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
# EXTRA FUNCTIONS
|
|
154
|
+
Check also the settings.py file for extra configuration, as
|
|
155
|
+
- proxy to be used
|
|
156
|
+
- async block size
|
|
157
|
+
- number of retries on error
|
|
158
|
+
etc.
|
|
159
|
+
|
|
160
|
+
##### Help
|
|
161
|
+
```
|
|
162
|
+
python3 run.py --help
|
|
163
|
+
```
|
|
164
|
+
will show all the available options of the script
|
|
165
|
+
|
|
166
|
+
##### Pools and Procs
|
|
167
|
+
- `-pools 4` will execute the script on 4 different cores, if available. Script will be spanned in 4 different processes
|
|
168
|
+
- `-proc 4` number of workers, should be **always a multiple of pools** to correctly distribute the job against different pools
|
|
169
|
+
|
|
170
|
+
##### DNS Server
|
|
171
|
+
`-dns-server 141.1.1.1` This is a DNS server that will be used when IP is not retrievable by httpx module. UDP output needs to be opened
|
|
172
|
+
|
|
173
|
+
This is a normal execution on the *bigbadboy* server to retrieve DNS information, because outbound UDP against google dns are blocked
|
|
174
|
+
```
|
|
175
|
+
python3 -u run.py --crawl -file commons/inputs/shopify_100k.csv -pools 2 -proc 2 -dns-server 127.0.0.53
|
|
176
|
+
```
|
|
177
|
+
##### Proxy
|
|
178
|
+
`--force-proxy` it will force to use the proxy PROXY_TO_USE set up in *settings.py* file
|
|
179
|
+
|
|
180
|
+
##### Test
|
|
181
|
+
`--test` will scrape just a percentage of the domains in input
|
|
182
|
+
The percentage that will be scraped depends on the parameter PORCENTAGE_TO_SCRAPE_IN_TEST defined in *settings.py*
|
|
183
|
+
|
|
184
|
+
##### SUB_FOLDERS
|
|
185
|
+
`-sub-folder SUBFOLDER` will use a different configuration for
|
|
186
|
+
- output folder
|
|
187
|
+
- dump folder
|
|
188
|
+
|
|
189
|
+
For consistency, this flag must be specified in ***db insert*** too
|
|
190
|
+
|
|
191
|
+
### All pipeline
|
|
192
|
+
So, the whole pipeline will become
|
|
193
|
+
```
|
|
194
|
+
python3 -u run.py --crawl -file commons/inputs/shopify_100k.csv -pools 2 -proc 2 -dns-server 127.0.0.53 -sub-folder SUBFOLDER
|
|
195
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER conn
|
|
196
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER landings
|
|
197
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st0
|
|
198
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st1
|
|
199
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER -dns-server 127.0.0.53 emails-st2
|
|
200
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st3
|
|
201
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER socials
|
|
202
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER url-internal
|
|
203
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER url-sitemaps
|
|
204
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies-st0
|
|
205
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies-st1
|
|
206
|
+
python3 run.py -sub-folder SUBFOLDER dbdt; python3 run.py -sub-folder SUBFOLDER dbct; python3 run.py -sub-folder SUBFOLDER dbi;
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
# HIGH LEVEL OPERATIONS
|
|
211
|
+
### EMAILS PIPELINE
|
|
212
|
+
If you want to apply the complete pipeline just for **emails**,
|
|
213
|
+
there is a flag --just-emails to apply to DB operations
|
|
214
|
+
|
|
215
|
+
- STEP1, recreate the output
|
|
216
|
+
```
|
|
217
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st0
|
|
218
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st1
|
|
219
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER -dns-server 127.0.0.53 emails-st2
|
|
220
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st3
|
|
221
|
+
```
|
|
222
|
+
- STEP2, recreate just the emails related tables: email_st0,email_st1,email_st2,email_st3
|
|
223
|
+
```
|
|
224
|
+
xtra='-sub-folder SUBFOLDER --just-emails'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### COMPANIES
|
|
228
|
+
If you want to apply the complete pipeline just for **companies**,
|
|
229
|
+
there is a flag --just-companies to apply to DB operations
|
|
230
|
+
|
|
231
|
+
1. recreate the output
|
|
232
|
+
```
|
|
233
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies-st0
|
|
234
|
+
python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies-st1
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
2. recreate just the emails related tables: email_st0,email_st1,email_st2,email_st3
|
|
238
|
+
```
|
|
239
|
+
xtra='-sub-folder SUBFOLDER --just-companies'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
### COMPANIES EXCLUSION
|
|
244
|
+
Create a table 'companies_exclusion' with an extra field set as "excluded" if some **domain_plus_tld** present in exclusion table based on **customer_id**
|
|
245
|
+
To define exclusion table parameters, in settings.py
|
|
246
|
+
- MYSQL_EXCLUSION_DB = 'customer'
|
|
247
|
+
- MYSQL_EXCLUSION_TABLE = 'exclusion';
|
|
248
|
+
- MYSQL_OUTPUT_COLUMN_CUSTOMER_ID = 'customer_id'
|
|
249
|
+
- MYSQL_OUTPUT_COLUMN_DOMAIN_WITH_TLD = 'domain_cleaned'
|
|
250
|
+
|
|
251
|
+
1. To run the script, this one **create a csv** file in output folder
|
|
252
|
+
```
|
|
253
|
+
python3 -u run.py --parse -sub-folder 20230422 -customer-id **test** --exact companies-exclusion
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
2. To **insert in DB**,
|
|
257
|
+
```
|
|
258
|
+
xtra='-sub-folder **20230422** --just-companies-exclusion'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
## PARQUET FILE EXPORTER
|
|
263
|
+
- Under ecommerce_crawler/commons/scripts/parquets
|
|
264
|
+
- Search for parquets files under ecommerce_crawler/data/SUB-FOLDER/METHOD/files
|
|
265
|
+
- To select domains, accept in input *one file with full path*, file must be a csv and contains a domain column
|
|
266
|
+
OR
|
|
267
|
+
- To select domains, Accept in input a *domain name part*,
|
|
268
|
+
|
|
269
|
+
- Accept as input the **-sub-folder**, to specify the relative directory
|
|
270
|
+
- Accept as input the **method** (urls-st0 or urls-st1 ie)
|
|
271
|
+
|
|
272
|
+
`python3 pq.py -file inputs/all_woo_shopify_3k.csv -sub-folder ALL_WOO_GB urls-st0`
|
|
273
|
+
`python3 pq.py -file inputs/all_woo_shopify_3k.csv -sub-folder ALL_WOO_GB urls-st1`
|
|
274
|
+
|
|
275
|
+
`python3 pq.py -name-part "keys4" -sub-folder ALL_WOO_GB urls-st1`
|
|
276
|
+
|
|
277
|
+
- Output will be saved in ecommerce_crawler/commons/scripts/parquets/outputs
|
|
278
|
+
- Output will be saved in the DB in a table whose name depends on the method, as defined in settings
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
LICENCE
|
|
2
|
+
MANIFEST.in
|
|
3
|
+
README.md
|
|
4
|
+
pyproject.toml
|
|
5
|
+
ispider_core/__init__.py
|
|
6
|
+
ispider_core/__main__.py
|
|
7
|
+
ispider_core/ispider.py
|
|
8
|
+
ispider_core/orchestrator.py
|
|
9
|
+
ispider_core/settings.py
|
|
10
|
+
ispider_core/addons/out_parser_full_to_json.py
|
|
11
|
+
ispider_core/crawlers/cls_controllers.py
|
|
12
|
+
ispider_core/crawlers/cls_queue_out.py
|
|
13
|
+
ispider_core/crawlers/cls_seen_filter.py
|
|
14
|
+
ispider_core/crawlers/http_client.py
|
|
15
|
+
ispider_core/crawlers/http_filters.py
|
|
16
|
+
ispider_core/crawlers/stage1_crawl.py
|
|
17
|
+
ispider_core/crawlers/stage1_crawl_helpers.py
|
|
18
|
+
ispider_core/crawlers/stage2_spider.py
|
|
19
|
+
ispider_core/crawlers/thread_queue_in.py
|
|
20
|
+
ispider_core/crawlers/thread_save_finished.py
|
|
21
|
+
ispider_core/crawlers/thread_stats.py
|
|
22
|
+
ispider_core/parsers/email_parser.py
|
|
23
|
+
ispider_core/parsers/filetype_parser.py
|
|
24
|
+
ispider_core/parsers/html_parser.py
|
|
25
|
+
ispider_core/parsers/sitemaps_parser.py
|
|
26
|
+
ispider_core/parsers/social_parser.py
|
|
27
|
+
ispider_core/storage/json_storage.py
|
|
28
|
+
ispider_core/utils/controllers.py
|
|
29
|
+
ispider_core/utils/domains.py
|
|
30
|
+
ispider_core/utils/efiles.py
|
|
31
|
+
ispider_core/utils/filters.py
|
|
32
|
+
ispider_core/utils/headers.py
|
|
33
|
+
ispider_core/utils/ifiles.py
|
|
34
|
+
ispider_core/utils/logger.py
|
|
35
|
+
ispider_core/utils/menu.py
|
|
36
|
+
ispider_core/utils/queues.py
|
|
37
|
+
ispider_core/utils/resume.py
|
|
38
|
+
tests/10_scrape_websites.py
|
|
39
|
+
tests/scrape_njorg.py
|
|
40
|
+
tests/test_run.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .ispider import ISpider
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# ispider_core/__main__.py
"""Command-line entry point for ispider_core.

Currently a stub: it parses the CLI arguments via ``menu()``, then exits
immediately, advising the user to import the modules instead.  The stage
dispatch below the exit is intentionally unreachable and kept only as a
placeholder for a future CLI implementation.
"""
import sys

from ispider_core.utils.menu import menu
from ispider_core.utils.logger import LoggerFactory

# Not used yet
if __name__ == "__main__":
    args = menu()
    if args.stage is None:
        print("No valid stage selected. Use -h for help.")
        sys.exit()

    print("****** Not usable yet from command line")
    print("****** Import modules instead")
    # Fix: use sys.exit() instead of quit() — quit() is a site-module
    # convenience meant for the interactive interpreter only and is not
    # guaranteed to exist in all runtime environments (e.g. frozen apps
    # or when site initialization is skipped).
    sys.exit()

    # --- Unreachable: placeholder stage dispatch for a future CLI ---
    if args.stage == 'stage1':
        # Call stage1 function here
        print("Running Stage 1...")
    elif args.stage == 'stage2':
        # Call stage2 function here
        print("Running Stage 2...")
    elif args.stage == 'stage3':
        # Call stage3 function here
        print("Running Stage 3...")
    elif args.stage == 'stage4':
        # Call stage4 function here
        print("Running Stage 4...")