ohmyscrapper 0.2.3-py3-none-any.whl → 0.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ohmyscrapper/__init__.py +44 -22
- ohmyscrapper/core/config.py +107 -0
- ohmyscrapper/core/config_files.py +73 -0
- ohmyscrapper/core/default_files/config.yaml +16 -0
- ohmyscrapper/core/default_files/url_sniffing.yaml +25 -0
- ohmyscrapper/core/default_files/url_types.yaml +5 -0
- ohmyscrapper/models/urls_manager.py +95 -41
- ohmyscrapper/modules/classify_urls.py +14 -6
- ohmyscrapper/modules/load_txt.py +79 -11
- ohmyscrapper/modules/process_with_ai.py +72 -36
- ohmyscrapper/modules/scrap_urls.py +130 -121
- ohmyscrapper/modules/seed.py +28 -2
- ohmyscrapper/modules/show.py +22 -14
- ohmyscrapper/modules/sniff_url.py +112 -45
- ohmyscrapper/modules/untouch_all.py +1 -1
- {ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.7.0.dist-info}/METADATA +21 -15
- ohmyscrapper-0.7.0.dist-info/RECORD +21 -0
- ohmyscrapper-0.2.3.dist-info/RECORD +0 -16
- {ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.7.0.dist-info}/WHEEL +0 -0
- {ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.7.0.dist-info}/entry_points.txt +0 -0
ohmyscrapper/models/urls_manager.py
CHANGED

@@ -4,31 +4,45 @@ import time
 import glob
 import pandas as pd
 from urllib.parse import urlparse, urlunparse
+from ohmyscrapper.core import config


 def get_db_dir():
-
-
-
+    db_folder = config.get_dir("db")
+    if not os.path.exists(db_folder):
+        os.mkdir(db_folder)
+    return db_folder


 def get_db_path():
-
+    db_file = config.get_db()
+    return os.path.join(get_db_dir(), db_file)


 def get_db_connection():
+    if not os.path.exists(get_db_path()):
+        create_tables(sqlite3.connect(get_db_path()))
     return sqlite3.connect(get_db_path())


-
-
+def use_connection(func):
+    def provide_connection(*args, **kwargs):
+        global conn
+        with get_db_connection() as conn:
+            try:
+                return func(*args, **kwargs)
+            except:
+                update_db()
+                return func(*args, **kwargs)

+    return provide_connection

-
+
+def create_tables(conn):

     c = conn.cursor()
     c.execute(
-        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT,
+        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, title TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
     )
     c.execute(
         "CREATE TABLE IF NOT EXISTS ai_log (id INTEGER PRIMARY KEY, instructions STRING, response STRING, model STRING, prompt_file STRING, prompt_name STRING, created_at DATETIME)"
@@ -38,27 +52,25 @@ def create_tables():
         "CREATE TABLE IF NOT EXISTS urls_valid_prefix (id INTEGER PRIMARY KEY, url_prefix TEXT UNIQUE, url_type TEXT)"
     )

-    return pd.read_sql_query("SELECT * FROM urls LIMIT 100", conn)

+def update_db():
+    try:
+        c = conn.cursor()
+        c.execute("ALTER TABLE urls RENAME COLUMN h1 TO title")
+    except:
+        pass

-# TODO: not sure this should be something. depends on the project
-def seeds():
-    create_tables()

-
-    add_urls_valid_prefix("https://lnkd.in/%", "linkedin_redirect")
-    add_urls_valid_prefix("https://%.linkedin.com/jobs/view/%", "linkedin_job")
-    add_urls_valid_prefix("https://%.linkedin.com/feed/%", "linkedin_feed")
-    add_urls_valid_prefix("https://%.linkedin.com/company/%", "linkedin_company")
+def seeds(seeds={}):

-
-
+    for url_type, url_prefix in seeds.items():
+        add_urls_valid_prefix(url_prefix, url_type)

     return True


+@use_connection
 def add_urls_valid_prefix(url_prefix, url_type):
-    conn = get_db_connection()

     df = pd.read_sql_query(
         f"SELECT * FROM urls_valid_prefix WHERE url_prefix = '{url_prefix}'", conn
@@ -72,6 +84,7 @@ def add_urls_valid_prefix(url_prefix, url_type):
     conn.commit()


+@use_connection
 def get_urls_valid_prefix_by_type(url_type):
     df = pd.read_sql_query(
         f"SELECT * FROM urls_valid_prefix WHERE url_type = '{url_type}'", conn
@@ -79,12 +92,14 @@ def get_urls_valid_prefix_by_type(url_type):
     return df


+@use_connection
 def get_urls_valid_prefix_by_id(id):
     df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix WHERE id = '{id}'", conn)
     return df


 # TODO: pagination required
+@use_connection
 def get_urls_valid_prefix(limit=0):
     if limit > 0:
         df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix LIMIT {limit}", conn)
@@ -94,6 +109,7 @@ def get_urls_valid_prefix(limit=0):


 # TODO: pagination required
+@use_connection
 def get_urls(limit=0):
     if limit > 0:
         df = pd.read_sql_query(
@@ -104,6 +120,7 @@ def get_urls(limit=0):
     return df


+@use_connection
 def get_urls_report():
     sql = """
     WITH parent_url AS (
@@ -113,7 +130,7 @@ def get_urls_report():
         SELECT
             u.id,
             u.url,
-            u.
+            u.title
         FROM urls u
         INNER JOIN parent_url p
         ON u.url = p.parent_url
@@ -122,9 +139,9 @@ def get_urls_report():
         u.id,
         u.url_type,
         u.url,
-        COALESCE(u.
+        COALESCE(u.title, p.title) as title,
         p.url as parent_url,
-        p.
+        p.title as parent_title
     FROM urls u
     LEFT JOIN parents p
     ON u.parent_url = p.url
@@ -138,6 +155,7 @@ def get_urls_report():
     return df


+@use_connection
 def get_url_by_url(url):
     url = clean_url(url)
     df = pd.read_sql_query(f"SELECT * FROM urls WHERE url = '{url}'", conn)
@@ -145,12 +163,14 @@ def get_url_by_url(url):
     return df


+@use_connection
 def get_url_by_id(id):
     df = pd.read_sql_query(f"SELECT * FROM urls WHERE id = '{id}'", conn)

     return df


+@use_connection
 def get_urls_by_url_type(url_type):
     df = pd.read_sql_query(
         f"SELECT * FROM urls WHERE history = 0 AND url_type = '{url_type}'", conn
@@ -158,6 +178,7 @@ def get_urls_by_url_type(url_type):
     return df


+@use_connection
 def get_urls_by_url_type_for_ai_process(url_type="linkedin_post", limit=10):
     df = pd.read_sql_query(
         f"SELECT * FROM urls WHERE history = 0 AND url_type = '{url_type}' AND ai_processed = 0 LIMIT {limit}",
@@ -166,6 +187,7 @@ def get_urls_by_url_type_for_ai_process(url_type="linkedin_post", limit=10):
     return df


+@use_connection
 def get_url_like_unclassified(like_condition):
     df = pd.read_sql_query(
         f"SELECT * FROM urls WHERE history = 0 AND url LIKE '{like_condition}' AND url_type IS NULL",
@@ -174,12 +196,13 @@ def get_url_like_unclassified(like_condition):
     return df


-
+@use_connection
+def add_url(url, title=None, parent_url=None):
     url = clean_url(url)
     c = conn.cursor()

-    if
-
+    if title is not None:
+        title = title.strip()

     if parent_url is None:
         parent_url = None
@@ -188,14 +211,15 @@ def add_url(url, h1=None, parent_url=None):

     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url,
-            (url,
+            "INSERT INTO urls (url, title, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
+            (url, title, parent_url, int(time.time())),
         )
         conn.commit()

     return get_url_by_url(url)


+@use_connection
 def add_ai_log(instructions, response, model, prompt_file, prompt_name):
     c = conn.cursor()

@@ -205,10 +229,14 @@ def add_ai_log(instructions, response, model, prompt_file, prompt_name):
     )
     conn.commit()

+
+@use_connection
 def get_ai_log():
     df = pd.read_sql_query(f"SELECT * FROM ai_log", conn)
     return df

+
+@use_connection
 def set_url_destiny(url, destiny):
     url = clean_url(url)
     destiny = clean_url(destiny)
@@ -222,45 +250,62 @@ def set_url_destiny(url, destiny):
     conn.commit()


-
+@use_connection
+def set_url_title(url, value):
     value = str(value).strip()
     url = clean_url(url)
     c = conn.cursor()
-    c.execute("UPDATE urls SET
+    c.execute("UPDATE urls SET title = ? WHERE url = ?", (value, url))
     conn.commit()


-
+@use_connection
+def set_url_title_by_id(id, value):
     value = str(value).strip()

     c = conn.cursor()
-    c.execute("UPDATE urls SET
+    c.execute("UPDATE urls SET title = ? WHERE id = ?", (value, id))
     conn.commit()


+@use_connection
 def set_url_ai_processed_by_id(id, json_str):
     value = 1
     value = str(value).strip()
     c = conn.cursor()
-    c.execute(
+    c.execute(
+        "UPDATE urls SET ai_processed = ? , json_ai = ? WHERE id = ?",
+        (value, json_str, id),
+    )
     conn.commit()

+
+@use_connection
 def set_url_empty_ai_processed_by_id(id, json_str="empty result"):
     value = 1
     value = str(value).strip()
     c = conn.cursor()
-    c.execute(
+    c.execute(
+        "UPDATE urls SET ai_processed = ? , json_ai = ? WHERE ai_processed = 0 AND id = ?",
+        (value, json_str, id),
+    )
     conn.commit()

+
+@use_connection
 def set_url_ai_processed_by_url(url, json_str):
     value = 1
     value = str(value).strip()
     url = clean_url(url)
     c = conn.cursor()
-    c.execute(
+    c.execute(
+        "UPDATE urls SET ai_processed = ?, json_ai = ? WHERE url = ?",
+        (value, json_str, url),
+    )
     conn.commit()


+@use_connection
 def set_url_description(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -268,6 +313,7 @@ def set_url_description(url, value):
     conn.commit()


+@use_connection
 def set_url_description_links(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -275,6 +321,7 @@ def set_url_description_links(url, value):
     conn.commit()


+@use_connection
 def set_url_json(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -282,6 +329,7 @@ def set_url_json(url, value):
     conn.commit()


+@use_connection
 def set_url_error(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -289,6 +337,7 @@ def set_url_error(url, value):
     conn.commit()


+@use_connection
 def set_url_type_by_id(url_id, url_type):
     c = conn.cursor()
     c.execute(f"UPDATE urls SET url_type = '{url_type}' WHERE id = {url_id}")
@@ -312,6 +361,7 @@ def clean_url(url):
     return url


+@use_connection
 def get_untouched_urls(
     limit=10, randomize=True, ignore_valid_prefix=False, only_parents=True
 ):
@@ -331,6 +381,7 @@ def get_untouched_urls(
     return df


+@use_connection
 def touch_url(url):
     url = clean_url(url)
     c = conn.cursor()
@@ -338,6 +389,7 @@ def touch_url(url):
     conn.commit()


+@use_connection
 def untouch_url(url):
     url = clean_url(url)
     c = conn.cursor()
@@ -345,12 +397,14 @@ def untouch_url(url):
     conn.commit()


+@use_connection
 def untouch_all_urls():
     c = conn.cursor()
     c.execute("UPDATE urls SET last_touch = NULL WHERE history = 0")
     conn.commit()


+@use_connection
 def set_all_urls_as_history():
     c = conn.cursor()
     c.execute("UPDATE urls SET history = 1")
@@ -382,19 +436,19 @@ def merge_dbs() -> None:
             row["description"],
             row["json"],
         )
-        # ßmerge_url(df)


-
+@use_connection
+def merge_url(url, title, last_touch, created_at, description, json):
     url = clean_url(url)
     c = conn.cursor()

-    if
-
+    if title is not None:
+        title = title.strip()

     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url,
-            (url,
+            "INSERT INTO urls (url, title, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
+            (url, title, last_touch, created_at, description, json),
        )
        conn.commit()
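
Note on the change above: 0.7.0 wraps each query helper in a use_connection decorator that opens the SQLite connection, publishes it as a module-level conn, and runs update_db() once on failure before retrying. A minimal, self-contained sketch of that pattern, not the package's actual implementation (the example.sqlite path and the tiny schema are placeholders; the real code resolves its path through config.get_dir("db") and config.get_db()):

import sqlite3

DB_PATH = "example.sqlite"  # placeholder path for the sketch

def use_connection(func):
    # Open a connection, expose it as the module-global `conn`, then run the wrapped function.
    def provide_connection(*args, **kwargs):
        global conn
        with sqlite3.connect(DB_PATH) as conn:
            return func(*args, **kwargs)
    return provide_connection

@use_connection
def add_url(url):
    conn.execute("CREATE TABLE IF NOT EXISTS urls (url TEXT UNIQUE)")
    conn.execute("INSERT OR IGNORE INTO urls (url) VALUES (?)", (url,))
    conn.commit()

@use_connection
def count_urls():
    return conn.execute("SELECT COUNT(*) FROM urls").fetchone()[0]

add_url("https://example.com")
print(count_urls())  # -> 1

The design trade-off visible in the diff is that the decorator keeps the existing function bodies untouched (they still read a global conn) while removing the per-function get_db_connection() calls.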
ohmyscrapper/modules/classify_urls.py
CHANGED

@@ -1,23 +1,31 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.modules import seed
 import pandas as pd
 import time


 def classify_urls(recursive=False):
-    urls_manager.seeds()
     df = urls_manager.get_urls_valid_prefix()
+    if len(df) == 0:
+        seed.seed()
+        classify_urls(recursive=recursive)
+        return

     keep_alive = True
     while keep_alive:
-        print("
+        print("#️⃣ URL Classifier woke up to classify urls!")
         for index, row_prefix in df.iterrows():
-            df_urls = urls_manager.get_url_like_unclassified(
+            df_urls = urls_manager.get_url_like_unclassified(
+                like_condition=row_prefix["url_prefix"]
+            )
             for index, row_urls in df_urls.iterrows():
-                urls_manager.set_url_type_by_id(
+                urls_manager.set_url_type_by_id(
+                    url_id=row_urls["id"], url_type=row_prefix["url_type"]
+                )

         if not recursive:
-            print("
+            print("#️⃣ URL Classifier said: I'm done! See you soon...")
             keep_alive = False
         else:
-            print("
+            print("#️⃣ URL Classifier is taking a nap...")
             time.sleep(10)
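
For context: when the prefix table is empty, classify_urls now falls back to seed.seed() and retries, and urls_manager.seeds() no longer hardcodes LinkedIn prefixes but accepts a mapping of url types to SQL LIKE prefixes. A hypothetical direct call using the values the removed hardcoded seeds() registered; in 0.7.0 the actual defaults presumably come from the new url_types.yaml default file, whose contents are not part of this diff:

import ohmyscrapper.models.urls_manager as urls_manager

# url_type -> LIKE prefix; values copied from the old hardcoded seeds()
urls_manager.seeds({
    "linkedin_redirect": "https://lnkd.in/%",
    "linkedin_job": "https://%.linkedin.com/jobs/view/%",
    "linkedin_feed": "https://%.linkedin.com/feed/%",
    "linkedin_company": "https://%.linkedin.com/company/%",
})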
ohmyscrapper/modules/load_txt.py
CHANGED

@@ -1,31 +1,99 @@
 import os
 from urlextract import URLExtract
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config


-def
+def _increment_file_name(text_file_content, file_name):
+    print(f"reading and loading file `{file_name}`... ")
+    with open(file_name, "r") as f:
+        return text_file_content + f.read()

-    if not os.path.exists("input"):
-        os.mkdir("input")

-
+def load_txt(file_name="input", verbose=False):
+    input_folder = config.get_dir("input")
+    if not os.path.exists(input_folder):
+        os.mkdir(input_folder)
+
     urls_manager.seeds()
-    # make it recursive for all files
-    text_file_content = open(file_name, "r").read()

-
+    text_file_content = ""
+    if file_name is not None and not os.path.isdir(file_name):
+        print(f"📖 reading file `{file_name}`... ")
+        if not os.path.exists(file_name):
+            if file_name.startswith("https://") or file_name.startswith("http://"):
+                text_file_content = " " + file_name + " "
+            else:
+                print(f"\n file `{file_name}` not found.")
+                return
+        else:
+            text_file_content = _increment_file_name(
+                text_file_content=text_file_content, file_name=file_name
+            )
+    else:
+        input_folder = config.get_dir("input")
+        print(f"📂 reading {input_folder} directory... ")
+        if file_name is None:
+            dir_files = input_folder
+        else:
+            dir_files = file_name
+        text_files = os.listdir(dir_files)
+        for file in text_files:
+            if not file.endswith(".txt"):
+                text_files.remove(file)
+        if len(text_files) == 0:
+            print(f"No text files found in {input_folder} directory!")
+            return
+        elif len(text_files) == 1:
+            print(f"📖 reading file `{dir_files}/{text_files[0]}`... ")
+            text_file_content = _increment_file_name(
+                text_file_content=text_file_content,
+                file_name=os.path.join(dir_files, text_files[0]),
+            )
+        else:
+            print("\nChoose a text file. Use `*` for process all and `q` to quit:")
+            for index, file in enumerate(text_files):
+                print(f"[{index}]:", os.path.join(dir_files, file))
+
+            text_file_option = -1
+            while text_file_option < 0 or text_file_option >= len(text_files):
+                text_file_option = input("Enter the file number: ")
+                if text_file_option == "*":
+                    for file in text_files:
+                        text_file_content = _increment_file_name(
+                            text_file_content=text_file_content,
+                            file_name=os.path.join(dir_files, file),
+                        )
+                    text_file_option = 0
+                elif text_file_option == "q":
+                    return
+                elif text_file_option.isdigit():
+                    text_file_option = int(text_file_option)
+                    if text_file_option >= 0 and text_file_option < len(text_files):
+                        text_file_content = _increment_file_name(
+                            text_file_content=text_file_content,
+                            file_name=os.path.join(
+                                dir_files, text_files[int(text_file_option)]
+                            ),
+                        )
+
+    print("🔎 looking for urls...")
+    urls_found = put_urls_from_string(
+        text_to_process=text_file_content, verbose=verbose
+    )

-    # move_it_to_processed
     print("--------------------")
-    print(
+    print("files processed")
+    print(f"📦 {urls_found} urls were extracted and packed into the database")


-def put_urls_from_string(text_to_process, parent_url=None):
+def put_urls_from_string(text_to_process, parent_url=None, verbose=False):
     if isinstance(text_to_process, str):
         extractor = URLExtract()
         for url in extractor.find_urls(text_to_process):
             urls_manager.add_url(url=url, parent_url=parent_url)
-
+            if verbose:
+                print(url, "added")

         return len(extractor.find_urls(text_to_process))
     else: