ispider 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. ispider-0.1.0/LICENCE +7 -0
  2. ispider-0.1.0/MANIFEST.in +3 -0
  3. ispider-0.1.0/PKG-INFO +302 -0
  4. ispider-0.1.0/README.md +278 -0
  5. ispider-0.1.0/ispider.egg-info/SOURCES.txt +40 -0
  6. ispider-0.1.0/ispider_core/__init__.py +1 -0
  7. ispider-0.1.0/ispider_core/__main__.py +30 -0
  8. ispider-0.1.0/ispider_core/addons/out_parser_full_to_json.py +102 -0
  9. ispider-0.1.0/ispider_core/crawlers/cls_controllers.py +155 -0
  10. ispider-0.1.0/ispider_core/crawlers/cls_queue_out.py +111 -0
  11. ispider-0.1.0/ispider_core/crawlers/cls_seen_filter.py +73 -0
  12. ispider-0.1.0/ispider_core/crawlers/http_client.py +193 -0
  13. ispider-0.1.0/ispider_core/crawlers/http_filters.py +52 -0
  14. ispider-0.1.0/ispider_core/crawlers/stage1_crawl.py +188 -0
  15. ispider-0.1.0/ispider_core/crawlers/stage1_crawl_helpers.py +75 -0
  16. ispider-0.1.0/ispider_core/crawlers/stage2_spider.py +195 -0
  17. ispider-0.1.0/ispider_core/crawlers/thread_queue_in.py +81 -0
  18. ispider-0.1.0/ispider_core/crawlers/thread_save_finished.py +67 -0
  19. ispider-0.1.0/ispider_core/crawlers/thread_stats.py +99 -0
  20. ispider-0.1.0/ispider_core/ispider.py +76 -0
  21. ispider-0.1.0/ispider_core/orchestrator.py +37 -0
  22. ispider-0.1.0/ispider_core/parsers/email_parser.py +0 -0
  23. ispider-0.1.0/ispider_core/parsers/filetype_parser.py +42 -0
  24. ispider-0.1.0/ispider_core/parsers/html_parser.py +148 -0
  25. ispider-0.1.0/ispider_core/parsers/sitemaps_parser.py +94 -0
  26. ispider-0.1.0/ispider_core/parsers/social_parser.py +0 -0
  27. ispider-0.1.0/ispider_core/settings.py +38 -0
  28. ispider-0.1.0/ispider_core/storage/json_storage.py +0 -0
  29. ispider-0.1.0/ispider_core/utils/controllers.py +5 -0
  30. ispider-0.1.0/ispider_core/utils/domains.py +22 -0
  31. ispider-0.1.0/ispider_core/utils/efiles.py +29 -0
  32. ispider-0.1.0/ispider_core/utils/filters.py +1 -0
  33. ispider-0.1.0/ispider_core/utils/headers.py +39 -0
  34. ispider-0.1.0/ispider_core/utils/ifiles.py +77 -0
  35. ispider-0.1.0/ispider_core/utils/logger.py +54 -0
  36. ispider-0.1.0/ispider_core/utils/menu.py +43 -0
  37. ispider-0.1.0/ispider_core/utils/queues.py +63 -0
  38. ispider-0.1.0/ispider_core/utils/resume.py +89 -0
  39. ispider-0.1.0/pyproject.toml +36 -0
  40. ispider-0.1.0/setup.cfg +4 -0
  41. ispider-0.1.0/tests/10_scrape_websites.py +27 -0
  42. ispider-0.1.0/tests/scrape_njorg.py +31 -0
  43. ispider-0.1.0/tests/test_run.py +5 -0
ispider-0.1.0/LICENCE ADDED
@@ -0,0 +1,7 @@
1
+ Copyright 2025 DANIELE RUGGINENTI
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,3 @@
1
+ exclude *.DS_Store
2
+ recursive-exclude __pycache__ *
3
+ recursive-exclude *.egg-info *
ispider-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,302 @@
1
+ Metadata-Version: 2.4
2
+ Name: ispider
3
+ Version: 0.1.0
4
+ Summary: A high-speed web spider for massive scraping.
5
+ Author-email: Daniele Rugginenti <daniele.rugginenti@gmail.com>
6
+ License-Expression: MIT
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENCE
9
+ Requires-Dist: aiohttp
10
+ Requires-Dist: beautifulsoup4
11
+ Requires-Dist: lxml
12
+ Requires-Dist: tqdm
13
+ Requires-Dist: requests
14
+ Requires-Dist: httpx
15
+ Requires-Dist: nslookup
16
+ Requires-Dist: tldextract
17
+ Requires-Dist: concurrent_log_handler
18
+ Requires-Dist: colorlog
19
+ Requires-Dist: brotli
20
+ Requires-Dist: validators
21
+ Requires-Dist: w3lib
22
+ Requires-Dist: pybloom_live
23
+ Dynamic: license-file
24
+
25
+ # ispider_core
26
+
27
+ # V0.1
28
+
29
+ ### Help
30
+ Show all the options
31
+ ```
32
+ python3 run.py --help
33
+ ```
34
+
35
+ ### Crawl - PIPELINE STEP 1
36
+ You can specify an input file with the full path. The file must contain the field ***domain*** (if no protocol is specified, https will be used as default)
37
+ ```
38
+ python3 run.py --crawl -file commons/inputs/size_report_urls.csv
39
+ ```
40
+ You can specify an input file in the **commons/input** folder
41
+ ```
42
+ python3 run.py --crawl -file size_report_urls.csv
43
+ ```
44
+
45
+ You can specify one url, with or without the protocol (https://)
46
+ ```
47
+ python3 run.py --crawl -one 'capitolawatches.com'
48
+ ```
49
+ You can specify a subfolder to dump all files.
50
+ Everything will be saved in dumps/xxx and will be available and independent from other subfolders
51
+ ```
52
+ python3 run.py --crawl -file FILE -sub-folder SUBFOLDER
53
+ ```
54
+
55
+ This is a working command used on the server, for v1.3
56
+ ```
57
+ python3 -u run.py --crawl -file commons/inputs/shopify_100k.csv -pools 2 -proc 2 -dns-server 127.0.0.53
58
+ ```
59
+
60
+ **Tested with more processes, faster, good in retrieving and retry/error corrections in v1.4**
61
+ ```
62
+ python3 -u run.py --crawl -file commons/inputs/shopify_100k.csv -pools 16 -proc 16 -dns-server 127.0.0.53
63
+ ```
64
+
65
+ ### Parse
66
+ ##### Parse ALL
67
+ These scripts are configured to parse everything and insert it into the DB, test or prod, depending on the script used
68
+ You can define the subfolder SUBFOLDER where to get the data
69
+ - TEST
70
+ - `./parse_test.sh SUBFOLDER`
71
+
72
+ - PROD
73
+ - `./parse_prod.sh SUBFOLDER`
74
+
75
+ ##### PIPELINE STEP 2
76
+ This to create a report for the connections metadata
77
+ ```
78
+ python3 run.py --parse -pools 24 -proc 24 conn
79
+ ```
80
+ ##### PIPELINE STEP 3
81
+ This to parse the landing page
82
+ ```
83
+ python3 run.py --parse -pools 24 -proc 24 landing
84
+ ```
85
+
86
+ ##### PIPELINE STEP 4 - EMAILS
87
+ 0. *STAGE 0*
88
+ Extract all emails from html pages
89
+ Create emails csv stage 0 in output, and st0 jsons in dump folder
90
+ ```
91
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st0
92
+ ```
93
+ 1. *STAGE 1*
94
+ From jsons produced by **st0**,
95
+ **group by email**, counts how many domains contains it
96
+ produce csv in output and st1 jsons in the dump folder
97
+ ```
98
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st1
99
+ ```
100
+ 2. *STAGE 2*
101
+ From jsons produced by **st0**,
102
+ **Email Classification**
103
+ produce csv and st2 jsons
104
+ **DNS Server needed for email_domain resolution**
105
+ ```
106
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER -dns-server 127.0.0.53 emails-st2
107
+ ```
108
+ 3. *STAGE 3*
109
+ From jsons produced by **st2**
110
+ extract all usable emails (is_usable is True)
111
+ produce a csv and **st3** jsons
112
+ ```
113
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st3
114
+ ```
115
+ 4. *STAGE 4*
116
+ **DB Insert**
117
+ ```
118
+ xtra='-sub-folder SUBFOLDER --just-emails'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
119
+ ```
120
+
121
+ To execute all stages:
122
+ ```
123
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st-all
124
+ xtra='-sub-folder SUBFOLDER --just-emails'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
125
+ ```
126
+
127
+ ##### PIPELINE STEP 5
128
+ ```
129
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER socials
130
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER url-internal
131
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER url-sitemaps
132
+ ```
133
+ ##### PIPELINE STEP 6
134
+ Will join all the companies based on final_url_domain, shopid, etc
135
+ ```
136
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies
137
+ ```
138
+
139
+ #### PIPELINE STEP 7
140
+ Company Names St0
141
+ A funcPermNames = [
142
+ removeNone, removeHyphen, removeQuote, removeSpace, removeDot, removeComma,
143
+ removeEmojis, replaceUnidecode,
144
+ replaceAnd, replaceUnd, replaceY, removeAnd,
145
+ replaceSpecial
146
+ ];
147
+ was used,
148
+ and a "combination without repetition" function to call those functions on the 'site_name_cleaned' from the landing page, in the order specified (for the case of the &).
149
+
150
+ So when company_name_classification = 'cf_name_no_dots_no_spaces_no_comma_replace_special_dom_no_hyphen'
151
+ that means something like
152
+ site_name_cleaned = 'Wel.do,ne s'a Cement™'
153
+ final_domain = "welldone-cement.com"
154
+
155
+ so in the string
156
+ -cf is always when match
157
+ -name_ it's everything related to name
158
+ -dom_ it's everything related to dom
159
+
160
+
161
+ ##### PIPELINE STEP FINAL
162
+ ### DB Insert
163
+ - To recreate tables and insert all CSV in DB ***ecomm_test***
164
+ ```
165
+ python3 run.py dbdt; python3 run.py dbct; python3 run.py dbi;
166
+ ```
167
+
168
+ - To insert all CSV in DB ***ecomm_prod***
169
+ ```
170
+ python3 run.py --prod dbdt; python3 run.py --prod dbct; python3 run.py --prod dbi;
171
+ ```
172
+ - To insert all CSV in DB ***ecomm_prod*** from some **SUBFOLDER**
173
+ ```
174
+ python3 run.py --prod dbdt; python3 run.py --prod dbct; python3 run.py -sub-folder --prod dbi;
175
+ ```
176
+
177
+ # EXTRA FUNCTIONS
178
+ Check also the settings.py file for extra configuration, as
179
+ - proxy to be used
180
+ - async block size
181
+ - number of retries on error
182
+ etc.
183
+
184
+ ##### Help
185
+ ```
186
+ python3 run.py --help
187
+ ```
188
+ will show all the available options of the script
189
+
190
+ ##### Pools and Procs
191
+ - `-pools 4` will execute the script on 4 different cores, if available. The script will be spawned as 4 different processes
192
+ - `-proc 4` number of workers, should be **always a multiple of pools** to correctly distribute the job against different pools
193
+
194
+ ##### DNS Server
195
+ `-dns-server 141.1.1.1` This is a DNS server that will be used when IP is not retrievable by httpx module. UDP output needs to be opened
196
+
197
+ This is a normal execution on the *bigbadboy* server to retrieve DNS information, because outbound UDP against google dns are blocked
198
+ ```
199
+ python3 -u run.py --crawl -file commons/inputs/shopify_100k.csv -pools 2 -proc 2 -dns-server 127.0.0.53
200
+ ```
201
+ ##### Proxy
202
+ `--force-proxy` it will force to use the proxy PROXY_TO_USE set up in *settings.py* file
203
+
204
+ ##### Test
205
+ `--test` will scrape just a percentage of the domains in input
206
+ The percentage that will be scraped depends on the parameter PORCENTAGE_TO_SCRAPE_IN_TEST defined in *settings.py*
207
+
208
+ ##### SUB_FOLDERS
209
+ `-sub-folder SUBFOLDER` will use a different configuration for
210
+ - output folder
211
+ - dump folder
212
+
213
+ For consistency, this flag must be specified in ***db insert*** too
214
+
215
+ ### All pipeline
216
+ So, the whole pipeline will become
217
+ ```
218
+ python3 -u run.py --crawl -file commons/inputs/shopify_100k.csv -pools 2 -proc 2 -dns-server 127.0.0.53 -sub-folder SUBFOLDER
219
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER conn
220
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER landings
221
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st0
222
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st1
223
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER -dns-server 127.0.0.53 emails-st2
224
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st3
225
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER socials
226
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER url-internal
227
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER url-sitemaps
228
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies-st0
229
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies-st1
230
+ python3 run.py -sub-folder SUBFOLDER dbdt; python3 run.py -sub-folder SUBFOLDER dbct; python3 run.py -sub-folder SUBFOLDER dbi;
231
+ ```
232
+
233
+
234
+ # HIGH LEVEL OPERATIONS
235
+ ### EMAILS PIPELINE
236
+ If you want to apply the complete pipeline just for **emails**,
237
+ there is a flag --just-emails to apply to DB operations
238
+
239
+ - STEP1, recreate the output
240
+ ```
241
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st0
242
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st1
243
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER -dns-server 127.0.0.53 emails-st2
244
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st3
245
+ ```
246
+ - STEP2, recreate just the emails related tables: email_st0,email_st1,email_st2,email_st3
247
+ ```
248
+ xtra='-sub-folder SUBFOLDER --just-emails'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
249
+ ```
250
+
251
+ ### COMPANIES
252
+ If you want to apply the complete pipeline just for **companies**,
253
+ there is a flag --just-companies to apply to DB operations
254
+
255
+ 1. recreate the output
256
+ ```
257
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies-st0
258
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies-st1
259
+ ```
260
+
261
+ 2. recreate just the emails related tables: email_st0,email_st1,email_st2,email_st3
262
+ ```
263
+ xtra='-sub-folder SUBFOLDER --just-companies'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
264
+ ```
265
+
266
+
267
+ ### COMPANIES EXCLUSION
268
+ Create a table 'companies_exclusion' with an extra field set as "excluded" if some **domain_plus_tld** present in exclusion table based on **customer_id**
269
+ To define exclusion table parameters, in settings.py
270
+ - MYSQL_EXCLUSION_DB = 'customer'
271
+ - MYSQL_EXCLUSION_TABLE = 'exclusion';
272
+ - MYSQL_OUTPUT_COLUMN_CUSTOMER_ID = 'customer_id'
273
+ - MYSQL_OUTPUT_COLUMN_DOMAIN_WITH_TLD = 'domain_cleaned'
274
+
275
+ 1. To run the script, this one **create a csv** file in output folder
276
+ ```
277
+ python3 -u run.py --parse -sub-folder 20230422 -customer-id **test** --exact companies-exclusion
278
+ ```
279
+
280
+ 2. To **insert in DB**,
281
+ ```
282
+ xtra='-sub-folder **20230422** --just-companies-exclusion'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
283
+ ```
284
+
285
+
286
+ ## PARQUET FILE EXPORTER
287
+ - Under ecommerce_crawler/commons/scripts/parquets
288
+ - Search for parquets files under ecommerce_crawler/data/SUB-FOLDER/METHOD/files
289
+ - To select domains, accept in input *one file with full path*, file must be a csv and contains a domain column
290
+ OR
291
+ - To select domains, Accept in input a *domain name part*,
292
+
293
+ - Accept as input the **-sub-folder**, to specify the relative directory
294
+ - Accept as input the **method** (urls-st0 or urls-st1 ie)
295
+
296
+ `python3 pq.py -file inputs/all_woo_shopify_3k.csv -sub-folder ALL_WOO_GB urls-st0`
297
+ `python3 pq.py -file inputs/all_woo_shopify_3k.csv -sub-folder ALL_WOO_GB urls-st1`
298
+
299
+ `python3 pq.py -name-part "keys4" -sub-folder ALL_WOO_GB urls-st1`
300
+
301
+ - Output will be saved in ecommerce_crawler/commons/scripts/parquets/outputs
302
+ - Output will be saved on DB in a table name dependent of the method, as defined in settings
@@ -0,0 +1,278 @@
1
+ # ispider_core
2
+
3
+ # V0.1
4
+
5
+ ### Help
6
+ Show all the options
7
+ ```
8
+ python3 run.py --help
9
+ ```
10
+
11
+ ### Crawl - PIPELINE STEP 1
12
+ You can specify an input file with the full path. The file must contain the field ***domain*** (if no protocol is specified, https will be used as default)
13
+ ```
14
+ python3 run.py --crawl -file commons/inputs/size_report_urls.csv
15
+ ```
16
+ You can specify an input file in the **commons/input** folder
17
+ ```
18
+ python3 run.py --crawl -file size_report_urls.csv
19
+ ```
20
+
21
+ You can specify one url, with or without the protocol (https://)
22
+ ```
23
+ python3 run.py --crawl -one 'capitolawatches.com'
24
+ ```
25
+ You can specify a subfolder to dump all files.
26
+ Everything will be saved in dumps/xxx and will be available and independent from other subfolders
27
+ ```
28
+ python3 run.py --crawl -file FILE -sub-folder SUBFOLDER
29
+ ```
30
+
31
+ This is a working command used on the server, for v1.3
32
+ ```
33
+ python3 -u run.py --crawl -file commons/inputs/shopify_100k.csv -pools 2 -proc 2 -dns-server 127.0.0.53
34
+ ```
35
+
36
+ **Tested with more processes, faster, good in retrieving and retry/error corrections in v1.4**
37
+ ```
38
+ python3 -u run.py --crawl -file commons/inputs/shopify_100k.csv -pools 16 -proc 16 -dns-server 127.0.0.53
39
+ ```
40
+
41
+ ### Parse
42
+ ##### Parse ALL
43
+ These scripts are configured to parse everything and insert it into the DB, test or prod, depending on the script used
44
+ You can define the subfolder SUBFOLDER where to get the data
45
+ - TEST
46
+ - `./parse_test.sh SUBFOLDER`
47
+
48
+ - PROD
49
+ - `./parse_prod.sh SUBFOLDER`
50
+
51
+ ##### PIPELINE STEP 2
52
+ This to create a report for the connections metadata
53
+ ```
54
+ python3 run.py --parse -pools 24 -proc 24 conn
55
+ ```
56
+ ##### PIPELINE STEP 3
57
+ This to parse the landing page
58
+ ```
59
+ python3 run.py --parse -pools 24 -proc 24 landing
60
+ ```
61
+
62
+ ##### PIPELINE STEP 4 - EMAILS
63
+ 0. *STAGE 0*
64
+ Extract all emails from html pages
65
+ Create emails csv stage 0 in output, and st0 jsons in dump folder
66
+ ```
67
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st0
68
+ ```
69
+ 1. *STAGE 1*
70
+ From jsons produced by **st0**,
71
+ **group by email**, counts how many domains contains it
72
+ produce csv in output and st1 jsons in the dump folder
73
+ ```
74
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st1
75
+ ```
76
+ 2. *STAGE 2*
77
+ From jsons produced by **st0**,
78
+ **Email Classification**
79
+ produce csv and st2 jsons
80
+ **DNS Server needed for email_domain resolution**
81
+ ```
82
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER -dns-server 127.0.0.53 emails-st2
83
+ ```
84
+ 3. *STAGE 3*
85
+ From jsons produced by **st2**
86
+ extract all usable emails (is_usable is True)
87
+ produce a csv and **st3** jsons
88
+ ```
89
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st3
90
+ ```
91
+ 4. *STAGE 4*
92
+ **DB Insert**
93
+ ```
94
+ xtra='-sub-folder SUBFOLDER --just-emails'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
95
+ ```
96
+
97
+ To execute all stages:
98
+ ```
99
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st-all
100
+ xtra='-sub-folder SUBFOLDER --just-emails'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
101
+ ```
102
+
103
+ ##### PIPELINE STEP 5
104
+ ```
105
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER socials
106
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER url-internal
107
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER url-sitemaps
108
+ ```
109
+ ##### PIPELINE STEP 6
110
+ Will join all the companies based on final_url_domain, shopid, etc
111
+ ```
112
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies
113
+ ```
114
+
115
+ #### PIPELINE STEP 7
116
+ Company Names St0
117
+ A funcPermNames = [
118
+ removeNone, removeHyphen, removeQuote, removeSpace, removeDot, removeComma,
119
+ removeEmojis, replaceUnidecode,
120
+ replaceAnd, replaceUnd, replaceY, removeAnd,
121
+ replaceSpecial
122
+ ];
123
+ was used,
124
+ and a "combination without repetition" function to call those functions on the 'site_name_cleaned' from the landing page, in the order specified (for the case of the &).
125
+
126
+ So when company_name_classification = 'cf_name_no_dots_no_spaces_no_comma_replace_special_dom_no_hyphen'
127
+ that means something like
128
+ site_name_cleaned = 'Wel.do,ne s'a Cement™'
129
+ final_domain = "welldone-cement.com"
130
+
131
+ so in the string
132
+ -cf is always when match
133
+ -name_ it's everything related to name
134
+ -dom_ it's everything related to dom
135
+
136
+
137
+ ##### PIPELINE STEP FINAL
138
+ ### DB Insert
139
+ - To recreate tables and insert all CSV in DB ***ecomm_test***
140
+ ```
141
+ python3 run.py dbdt; python3 run.py dbct; python3 run.py dbi;
142
+ ```
143
+
144
+ - To insert all CSV in DB ***ecomm_prod***
145
+ ```
146
+ python3 run.py --prod dbdt; python3 run.py --prod dbct; python3 run.py --prod dbi;
147
+ ```
148
+ - To insert all CSV in DB ***ecomm_prod*** from some **SUBFOLDER**
149
+ ```
150
+ python3 run.py --prod dbdt; python3 run.py --prod dbct; python3 run.py -sub-folder --prod dbi;
151
+ ```
152
+
153
+ # EXTRA FUNCTIONS
154
+ Check also the settings.py file for extra configuration, as
155
+ - proxy to be used
156
+ - async block size
157
+ - number of retries on error
158
+ etc.
159
+
160
+ ##### Help
161
+ ```
162
+ python3 run.py --help
163
+ ```
164
+ will show all the available options of the script
165
+
166
+ ##### Pools and Procs
167
+ - `-pools 4` will execute the script on 4 different cores, if available. The script will be spawned as 4 different processes
168
+ - `-proc 4` number of workers, should be **always a multiple of pools** to correctly distribute the job against different pools
169
+
170
+ ##### DNS Server
171
+ `-dns-server 141.1.1.1` This is a DNS server that will be used when IP is not retrievable by httpx module. UDP output needs to be opened
172
+
173
+ This is a normal execution on the *bigbadboy* server to retrieve DNS information, because outbound UDP against google dns are blocked
174
+ ```
175
+ python3 -u run.py --crawl -file commons/inputs/shopify_100k.csv -pools 2 -proc 2 -dns-server 127.0.0.53
176
+ ```
177
+ ##### Proxy
178
+ `--force-proxy` it will force to use the proxy PROXY_TO_USE set up in *settings.py* file
179
+
180
+ ##### Test
181
+ `--test` will scrape just a percentage of the domains in input
182
+ The percentage that will be scraped depends on the parameter PORCENTAGE_TO_SCRAPE_IN_TEST defined in *settings.py*
183
+
184
+ ##### SUB_FOLDERS
185
+ `-sub-folder SUBFOLDER` will use a different configuration for
186
+ - output folder
187
+ - dump folder
188
+
189
+ For consistency, this flag must be specified in ***db insert*** too
190
+
191
+ ### All pipeline
192
+ So, the whole pipeline will become
193
+ ```
194
+ python3 -u run.py --crawl -file commons/inputs/shopify_100k.csv -pools 2 -proc 2 -dns-server 127.0.0.53 -sub-folder SUBFOLDER
195
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER conn
196
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER landings
197
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st0
198
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st1
199
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER -dns-server 127.0.0.53 emails-st2
200
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st3
201
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER socials
202
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER url-internal
203
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER url-sitemaps
204
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies-st0
205
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies-st1
206
+ python3 run.py -sub-folder SUBFOLDER dbdt; python3 run.py -sub-folder SUBFOLDER dbct; python3 run.py -sub-folder SUBFOLDER dbi;
207
+ ```
208
+
209
+
210
+ # HIGH LEVEL OPERATIONS
211
+ ### EMAILS PIPELINE
212
+ If you want to apply the complete pipeline just for **emails**,
213
+ there is a flag --just-emails to apply to DB operations
214
+
215
+ - STEP1, recreate the output
216
+ ```
217
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st0
218
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st1
219
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER -dns-server 127.0.0.53 emails-st2
220
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER emails-st3
221
+ ```
222
+ - STEP2, recreate just the emails related tables: email_st0,email_st1,email_st2,email_st3
223
+ ```
224
+ xtra='-sub-folder SUBFOLDER --just-emails'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
225
+ ```
226
+
227
+ ### COMPANIES
228
+ If you want to apply the complete pipeline just for **companies**,
229
+ there is a flag --just-companies to apply to DB operations
230
+
231
+ 1. recreate the output
232
+ ```
233
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies-st0
234
+ python3 run.py --parse -pools 24 -proc 24 -sub-folder SUBFOLDER companies-st1
235
+ ```
236
+
237
+ 2. recreate just the emails related tables: email_st0,email_st1,email_st2,email_st3
238
+ ```
239
+ xtra='-sub-folder SUBFOLDER --just-companies'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
240
+ ```
241
+
242
+
243
+ ### COMPANIES EXCLUSION
244
+ Create a table 'companies_exclusion' with an extra field set as "excluded" if some **domain_plus_tld** present in exclusion table based on **customer_id**
245
+ To define exclusion table parameters, in settings.py
246
+ - MYSQL_EXCLUSION_DB = 'customer'
247
+ - MYSQL_EXCLUSION_TABLE = 'exclusion';
248
+ - MYSQL_OUTPUT_COLUMN_CUSTOMER_ID = 'customer_id'
249
+ - MYSQL_OUTPUT_COLUMN_DOMAIN_WITH_TLD = 'domain_cleaned'
250
+
251
+ 1. To run the script, this one **create a csv** file in output folder
252
+ ```
253
+ python3 -u run.py --parse -sub-folder 20230422 -customer-id **test** --exact companies-exclusion
254
+ ```
255
+
256
+ 2. To **insert in DB**,
257
+ ```
258
+ xtra='-sub-folder **20230422** --just-companies-exclusion'; python3 run.py $xtra dbdt; python3 run.py $xtra dbct; python3 run.py $xtra dbi;
259
+ ```
260
+
261
+
262
+ ## PARQUET FILE EXPORTER
263
+ - Under ecommerce_crawler/commons/scripts/parquets
264
+ - Search for parquets files under ecommerce_crawler/data/SUB-FOLDER/METHOD/files
265
+ - To select domains, accept in input *one file with full path*, file must be a csv and contains a domain column
266
+ OR
267
+ - To select domains, Accept in input a *domain name part*,
268
+
269
+ - Accept as input the **-sub-folder**, to specify the relative directory
270
+ - Accept as input the **method** (urls-st0 or urls-st1 ie)
271
+
272
+ `python3 pq.py -file inputs/all_woo_shopify_3k.csv -sub-folder ALL_WOO_GB urls-st0`
273
+ `python3 pq.py -file inputs/all_woo_shopify_3k.csv -sub-folder ALL_WOO_GB urls-st1`
274
+
275
+ `python3 pq.py -name-part "keys4" -sub-folder ALL_WOO_GB urls-st1`
276
+
277
+ - Output will be saved in ecommerce_crawler/commons/scripts/parquets/outputs
278
+ - Output will be saved on DB in a table name dependent of the method, as defined in settings
@@ -0,0 +1,40 @@
1
+ LICENCE
2
+ MANIFEST.in
3
+ README.md
4
+ pyproject.toml
5
+ ispider_core/__init__.py
6
+ ispider_core/__main__.py
7
+ ispider_core/ispider.py
8
+ ispider_core/orchestrator.py
9
+ ispider_core/settings.py
10
+ ispider_core/addons/out_parser_full_to_json.py
11
+ ispider_core/crawlers/cls_controllers.py
12
+ ispider_core/crawlers/cls_queue_out.py
13
+ ispider_core/crawlers/cls_seen_filter.py
14
+ ispider_core/crawlers/http_client.py
15
+ ispider_core/crawlers/http_filters.py
16
+ ispider_core/crawlers/stage1_crawl.py
17
+ ispider_core/crawlers/stage1_crawl_helpers.py
18
+ ispider_core/crawlers/stage2_spider.py
19
+ ispider_core/crawlers/thread_queue_in.py
20
+ ispider_core/crawlers/thread_save_finished.py
21
+ ispider_core/crawlers/thread_stats.py
22
+ ispider_core/parsers/email_parser.py
23
+ ispider_core/parsers/filetype_parser.py
24
+ ispider_core/parsers/html_parser.py
25
+ ispider_core/parsers/sitemaps_parser.py
26
+ ispider_core/parsers/social_parser.py
27
+ ispider_core/storage/json_storage.py
28
+ ispider_core/utils/controllers.py
29
+ ispider_core/utils/domains.py
30
+ ispider_core/utils/efiles.py
31
+ ispider_core/utils/filters.py
32
+ ispider_core/utils/headers.py
33
+ ispider_core/utils/ifiles.py
34
+ ispider_core/utils/logger.py
35
+ ispider_core/utils/menu.py
36
+ ispider_core/utils/queues.py
37
+ ispider_core/utils/resume.py
38
+ tests/10_scrape_websites.py
39
+ tests/scrape_njorg.py
40
+ tests/test_run.py
@@ -0,0 +1 @@
1
+ from .ispider import ISpider
@@ -0,0 +1,30 @@
1
+ # ispider_core/__main__.py
2
+ import sys
3
+ from ispider_core.utils.menu import menu
4
+ from ispider_core.utils.logger import LoggerFactory
5
+
6
+ # Not used yet
7
+ if __name__ == "__main__":
8
+ args = menu()
9
+ if args.stage is None:
10
+ print("No valid stage selected. Use -h for help.")
11
+ sys.exit()
12
+
13
+ print("****** Not usable yet from command line")
14
+ print("****** Import modules instead")
15
+ quit()
16
+
17
+ if args.stage == 'stage1':
18
+ # Call stage1 function here
19
+ print("Running Stage 1...")
20
+ elif args.stage == 'stage2':
21
+ # Call stage2 function here
22
+ print("Running Stage 2...")
23
+ elif args.stage == 'stage3':
24
+ # Call stage3 function here
25
+ print("Running Stage 3...")
26
+ elif args.stage == 'stage4':
27
+ # Call stage4 function here
28
+ print("Running Stage 4...")
29
+
30
+