ohmyscrapper 0.6.1__py3-none-any.whl → 0.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ohmyscrapper/__init__.py +25 -3
- ohmyscrapper/core/config.py +18 -0
- ohmyscrapper/core/config_files.py +29 -2
- ohmyscrapper/core/default_files/config.yaml +4 -0
- ohmyscrapper/core/default_files/url_sniffing.yaml +29 -0
- ohmyscrapper/models/urls_manager.py +47 -21
- ohmyscrapper/modules/classify_urls.py +5 -1
- ohmyscrapper/modules/load_txt.py +7 -3
- ohmyscrapper/modules/process_with_ai.py +8 -8
- ohmyscrapper/modules/scrap_urls.py +106 -122
- ohmyscrapper/modules/seed.py +4 -1
- ohmyscrapper/modules/sniff_url.py +70 -37
- {ohmyscrapper-0.6.1.dist-info → ohmyscrapper-0.7.4.dist-info}/METADATA +6 -3
- ohmyscrapper-0.7.4.dist-info/RECORD +21 -0
- ohmyscrapper-0.6.1.dist-info/RECORD +0 -20
- {ohmyscrapper-0.6.1.dist-info → ohmyscrapper-0.7.4.dist-info}/WHEEL +0 -0
- {ohmyscrapper-0.6.1.dist-info → ohmyscrapper-0.7.4.dist-info}/entry_points.txt +0 -0
ohmyscrapper/__init__.py
CHANGED

@@ -20,7 +20,7 @@ from ohmyscrapper.core.config import update

 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.6.1")
+    parser.add_argument("--version", action="version", version="%(prog)s v0.7.4")

     update()
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
@@ -28,6 +28,9 @@ def main():
         "start",
         help="Make the entire process of 📦 loading, 🐶 scraping and 📜🖋️ exporting with the default configuration.",
     )
+    start_parser.add_argument(
+        "-input", default=None, help="File/Folder path or url for pre-loading."
+    )

     start_parser.add_argument(
         "--ai",
@@ -50,6 +53,14 @@ def main():
         help="Add all `url_types` from the bank to the `/ohmyscrapper/url_types.yaml` file.",
         action="store_true",
     )
+
+    seed_parser.add_argument(
+        "--reset",
+        default=False,
+        help="Reset all `url_types`.",
+        action="store_true",
+    )
+
     untouch_parser = subparsers.add_parser(
         "untouch-all", help="Untouch all urls. That resets classification"
     )
@@ -85,6 +96,9 @@ def main():
     scrap_urls_parser.add_argument(
         "--verbose", default=False, help="Run in verbose mode", action="store_true"
     )
+    scrap_urls_parser.add_argument(
+        "-input", default=None, help="File/Folder path or url for pre-loading."
+    )

     sniff_url_parser = subparsers.add_parser("sniff-url", help="🐕 Sniff/Check url")
     sniff_url_parser.add_argument(
@@ -131,7 +145,7 @@ def main():
         if args.export:
             export_url_types_to_file()
         else:
-            seed()
+            seed(args.reset)
         return

     if args.command == "untouch-all":
@@ -143,6 +157,9 @@ def main():
         return

     if args.command == "scrap-urls":
+        if args.input != None:
+            load_txt(file_name=args.input, verbose=args.verbose)
+
         scrap_urls(
             recursive=args.recursive,
             ignore_valid_prefix=args.ignore_type,
@@ -182,7 +199,12 @@ def main():
         return

     if args.command == "start":
-
+        seed()
+        if args.input != None:
+            load_txt(file_name=args.input)
+        else:
+            load_txt()
+
         scrap_urls(
             recursive=True,
             ignore_valid_prefix=True,
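Both `start` and `scrap-urls` gained a single-dash `-input` option for pre-loading a file, folder, or URL. A minimal, self-contained sketch (a stand-in, not the package's actual parser) of how argparse treats that spelling:

```python
import argparse

# argparse accepts a single-dash, multi-character option string such as
# "-input" and stores it under args.input, exactly as "--input" would be.
parser = argparse.ArgumentParser(prog="demo")
subparsers = parser.add_subparsers(dest="command")
start_parser = subparsers.add_parser("start")
start_parser.add_argument(
    "-input", default=None, help="File/Folder path or url for pre-loading."
)

args = parser.parse_args(["start", "-input", "./urls.txt"])
assert args.command == "start"
assert args.input == "./urls.txt"
```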
ohmyscrapper/core/config.py
CHANGED

@@ -39,6 +39,12 @@ def get_ai(param):
     )


+def get_sniffing(param):
+    return config_files.get_param(
+        parent_param="sniffing", param=param, default_app_dir=default_app_dir
+    )
+
+
 def load_config(force_default=False):
     config_file_name = "config.yaml"
     config_params = config_files.create_and_read_config_file(
@@ -67,6 +73,18 @@ def get_url_types():
     )


+def get_url_sniffing():
+    file = get_files("url_sniffing")
+    return config_files.create_and_read_config_file(
+        file, default_app_dir=default_app_dir
+    )
+
+
+def append_url_sniffing(data):
+    file = get_files("url_sniffing")
+    _append_config_file(data, file)
+
+
 def append_url_types(url_types):
     url_types_file = get_files("url_types")
     _append_config_file(url_types, url_types_file)
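The `config.yaml` change (+4 lines) is not expanded in this diff, but `sniff_url.py` below calls `config.get_sniffing("timeout")`, which implies a top-level `sniffing:` block. A hypothetical sketch of the two-level lookup `get_sniffing` delegates to; the YAML content and timeout value are assumptions, not the shipped default:

```python
import yaml

# Assumed shape of the new config.yaml block (not shown in this diff).
config_text = """
sniffing:
  timeout: 10
"""

def get_param(config_params, parent_param, param):
    # Two-level lookup: config[parent_param][param], None when absent.
    return (config_params.get(parent_param) or {}).get(param)

config_params = yaml.safe_load(config_text)
assert get_param(config_params, "sniffing", "timeout") == 10
```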
ohmyscrapper/core/config_files.py
CHANGED

@@ -4,14 +4,29 @@ import yaml

 def create_and_read_config_file(file_name, default_app_dir, force_default=False):
     config_file = config_file_path(file_name, default_app_dir)
+    default_config_params = _get_default_file(default_file=file_name)
     if force_default or not os.path.exists(config_file):
-        config_params = _get_default_file(default_file=file_name)
         overwrite_config_file(
-            data=config_params, file_name=file_name, default_app_dir=default_app_dir
+            data=default_config_params,
+            file_name=file_name,
+            default_app_dir=default_app_dir,
         )
+        config_params = default_config_params
     else:
         with open(config_file, "r") as f:
             config_params = yaml.safe_load(f.read())
+        if complete_config_file(
+            config_params=config_params,
+            default_config_params=default_config_params,
+            file_name=file_name,
+            default_app_dir=default_app_dir,
+        ):
+            config_params = create_and_read_config_file(
+                file_name=file_name,
+                default_app_dir=default_app_dir,
+                force_default=force_default,
+            )
+
     if config_params is None:
         config_params = create_and_read_config_file(
             file_name=file_name, default_app_dir=default_app_dir, force_default=True
@@ -19,6 +34,18 @@ def create_and_read_config_file(file_name, default_app_dir, force_default=False):
     return config_params


+def complete_config_file(
+    config_params, default_config_params, file_name, default_app_dir
+):
+    has_updated = False
+    for key, values in default_config_params.items():
+        if key not in config_params.keys():
+            has_updated = True
+            data = {key: values}
+            append_config_file(data, file_name, default_app_dir)
+    return has_updated
+
+
 def overwrite_config_file(data, file_name, default_app_dir):
     config_file = config_file_path(file_name, default_app_dir)
     with open(config_file, "+w") as f:
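A minimal sketch of the key-completion behaviour added above: any top-level key present in the bundled defaults but missing from the user's file is appended, and `create_and_read_config_file` re-reads the file when that happens. This appears to be what lets an existing 0.6.1 config pick up the new defaults on upgrade. The `append` callback and the sample values stand in for `append_config_file` and the real defaults:

```python
# Stand-in for complete_config_file: report whether any default
# top-level key was missing and hand each one to the append callback.
def complete_config(config_params, default_config_params, append):
    has_updated = False
    for key, values in default_config_params.items():
        if key not in config_params:
            has_updated = True
            append({key: values})
    return has_updated

user_file = {"ai": {"model": "gemini"}}  # illustrative pre-0.7 user config
defaults = {"ai": {"model": "gemini"}, "sniffing": {"timeout": 10}}

appended = []
assert complete_config(user_file, defaults, appended.append) is True
assert appended == [{"sniffing": {"timeout": 10}}]
```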
ohmyscrapper/core/default_files/url_sniffing.yaml
ADDED

@@ -0,0 +1,29 @@
+linkedin_feed:
+  metatags:
+    og:url: url_destiny
+
+linkedin_job:
+  bodytags:
+    h1: title
+  metatags:
+    og:title: title
+    og:description: description
+    description: description
+
+linkedin_post:
+  bodytags:
+    h1: title
+  metatags:
+    og:title: title
+    og:description: description
+    description: description
+
+linkedin_redirect:
+  metatags:
+    og:url: url_destiny
+  atags:
+    first-tag-as-url_destiny: 5
+
+read_all_a_tags:
+  atags:
+    load_atags: True
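Each top-level key in this new default file is a `url_type`; the nested `metatags` and `bodytags` maps go from an HTML tag (or meta property) to one of the `urls`-table columns written elsewhere in this release (`title`, `description`, `url_destiny`). A small sketch of that mapping, mirroring the `linkedin_job` entry; the scraped values are invented:

```python
import yaml

# scrap_urls applies metatags first and bodytags second, so a scraped
# <h1> overrides og:title when both map to `title`.
entry = yaml.safe_load("""
linkedin_job:
  bodytags:
    h1: title
  metatags:
    og:title: title
    og:description: description
""")["linkedin_job"]

url_report = {"h1": "Senior Data Engineer", "og:title": "Job | Acme"}
db_fields = {}
for section in ("metatags", "bodytags"):
    for tag, db_field in entry.get(section, {}).items():
        if tag in url_report:
            db_fields[db_field] = url_report[tag]

assert db_fields["title"] == "Senior Data Engineer"
```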
ohmyscrapper/models/urls_manager.py
CHANGED

@@ -29,7 +29,11 @@ def use_connection(func):
     def provide_connection(*args, **kwargs):
         global conn
         with get_db_connection() as conn:
-            return func(*args, **kwargs)
+            try:
+                return func(*args, **kwargs)
+            except:
+                update_db()
+                return func(*args, **kwargs)

     return provide_connection

@@ -38,7 +42,7 @@ def create_tables(conn):

     c = conn.cursor()
     c.execute(
-        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, h1 TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
+        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, title TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
     )
     c.execute(
         "CREATE TABLE IF NOT EXISTS ai_log (id INTEGER PRIMARY KEY, instructions STRING, response STRING, model STRING, prompt_file STRING, prompt_name STRING, created_at DATETIME)"
@@ -49,6 +53,14 @@ def create_tables(conn):
     )


+def update_db():
+    try:
+        c = conn.cursor()
+        c.execute("ALTER TABLE urls RENAME COLUMN h1 TO title")
+    except:
+        pass
+
+
 def seeds(seeds={}):

     for url_type, url_prefix in seeds.items():
@@ -57,6 +69,14 @@ def seeds(seeds={}):
     return True


+@use_connection
+def reset_seeds():
+    sql = "DELETE FROM urls_valid_prefix"
+    c = conn.cursor()
+    c.execute(sql)
+    conn.commit()
+
+
 @use_connection
 def add_urls_valid_prefix(url_prefix, url_type):

@@ -118,7 +138,7 @@ def get_urls_report():
         SELECT
             u.id,
             u.url,
-            u.h1
+            u.title
         FROM urls u
         INNER JOIN parent_url p
         ON u.url = p.parent_url
@@ -127,9 +147,9 @@ def get_urls_report():
             u.id,
             u.url_type,
             u.url,
-            COALESCE(u.h1, p.h1) as h1,
+            COALESCE(u.title, p.title) as title,
             p.url as parent_url,
-            p.h1 as parent_h1
+            p.title as parent_title
         FROM urls u
         LEFT JOIN parents p
         ON u.parent_url = p.url
@@ -185,12 +205,14 @@ def get_url_like_unclassified(like_condition):


 @use_connection
-def add_url(url, h1=None, parent_url=None):
+def add_url(url, title=None, parent_url=None):
+    if url[:1] == "/":
+        return
     url = clean_url(url)
     c = conn.cursor()

-    if h1 is not None:
-        h1 = h1.strip()
+    if title is not None:
+        title = title.strip()

     if parent_url is None:
         parent_url = None
@@ -199,8 +221,8 @@ def add_url(url, h1=None, parent_url=None):

     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url, h1, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
-            (url, h1, parent_url, int(time.time())),
+            "INSERT INTO urls (url, title, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
+            (url, title, parent_url, int(time.time())),
         )
         conn.commit()
@@ -239,20 +261,20 @@ def set_url_destiny(url, destiny):


 @use_connection
-def set_url_h1(url, value):
+def set_url_title(url, value):
     value = str(value).strip()
     url = clean_url(url)
     c = conn.cursor()
-    c.execute("UPDATE urls SET h1 = ? WHERE url = ?", (value, url))
+    c.execute("UPDATE urls SET title = ? WHERE url = ?", (value, url))
     conn.commit()


 @use_connection
-def set_url_h1_by_id(id, value):
+def set_url_title_by_id(id, value):
     value = str(value).strip()

     c = conn.cursor()
-    c.execute("UPDATE urls SET h1 = ? WHERE id = ?", (value, id))
+    c.execute("UPDATE urls SET title = ? WHERE id = ?", (value, id))
     conn.commit()

@@ -328,7 +350,9 @@ def set_url_error(url, value):
 @use_connection
 def set_url_type_by_id(url_id, url_type):
     c = conn.cursor()
-    c.execute(f"UPDATE urls SET url_type = '{url_type}' WHERE id = {url_id}")
+    c.execute(
+        f"UPDATE urls SET url_type = '{url_type}', last_touch = NULL WHERE id = {url_id}"
+    )
     conn.commit()

@@ -380,8 +404,10 @@ def touch_url(url):
 @use_connection
 def untouch_url(url):
     url = clean_url(url)
+    url = str(url.strip())
+
     c = conn.cursor()
-    c.execute("UPDATE urls SET last_touch = NULL WHERE url = ?", (url,))
+    c.execute(f"UPDATE urls SET last_touch = NULL, url_type = NULL WHERE url = '{url}'")
     conn.commit()

@@ -427,16 +453,16 @@ def merge_dbs() -> None:


 @use_connection
-def merge_url(url, h1, last_touch, created_at, description, json):
+def merge_url(url, title, last_touch, created_at, description, json):
     url = clean_url(url)
     c = conn.cursor()

-    if h1 is not None:
-        h1 = h1.strip()
+    if title is not None:
+        title = title.strip()

     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url, h1, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
-            (url, h1, last_touch, created_at, description, json),
+            "INSERT INTO urls (url, title, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
+            (url, title, last_touch, created_at, description, json),
         )
         conn.commit()
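The `use_connection` change is a retry-after-migration pattern: the wrapped call runs once, and on any failure `update_db` attempts the `h1` → `title` column rename before a single retry. A self-contained sketch against an in-memory SQLite database (the real code keeps the connection in a module-level `conn`):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE urls (h1 TEXT)")  # pre-0.7 schema

def update_db():
    # The shipped migration; RENAME COLUMN needs SQLite >= 3.25.
    conn.execute("ALTER TABLE urls RENAME COLUMN h1 TO title")

def with_migration_retry(func):
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            update_db()
            return func(*args, **kwargs)
    return wrapper

@with_migration_retry
def read_titles():
    return conn.execute("SELECT title FROM urls").fetchall()

# The first attempt fails on the old schema, the rename runs, the retry succeeds.
assert read_titles() == []
```

Note that the bare `except:` in the shipped decorator retries on any error, so unrelated failures also trigger a (harmless, already-applied) migration attempt before the second call.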
ohmyscrapper/modules/classify_urls.py
CHANGED

@@ -1,11 +1,15 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.modules import seed
 import pandas as pd
 import time


 def classify_urls(recursive=False):
-    urls_manager.seeds()
     df = urls_manager.get_urls_valid_prefix()
+    if len(df) == 0:
+        seed.seed()
+        classify_urls(recursive=recursive)
+        return

     keep_alive = True
     while keep_alive:
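In miniature, the auto-seed added above: an empty prefix table triggers one seeding pass and a restart. The prefix tuple below is illustrative; the real `seed()` writes the bundled `url_types.yaml` entries, which is what keeps the recursion from running forever:

```python
# Stand-in for classify_urls: seed once when no valid prefixes exist,
# then classify again with the freshly seeded data.
def classify(get_prefixes, seed):
    prefixes = get_prefixes()
    if len(prefixes) == 0:
        seed()
        return classify(get_prefixes, seed)
    return prefixes

store = []
seeded = classify(
    lambda: store,
    lambda: store.append(("linkedin_job", "https://www.linkedin.com/jobs/")),
)
assert seeded == [("linkedin_job", "https://www.linkedin.com/jobs/")]
```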
ohmyscrapper/modules/load_txt.py
CHANGED

@@ -19,14 +19,16 @@ def load_txt(file_name="input", verbose=False):

     text_file_content = ""
     if file_name is not None and not os.path.isdir(file_name):
-        print(f"📖 reading file `{file_name}`... ")
         if not os.path.exists(file_name):
             if file_name.startswith("https://") or file_name.startswith("http://"):
+                print(f"📖 reading url `{file_name}`... ")
                 text_file_content = " " + file_name + " "
+                urls_manager.untouch_url(url=file_name)
             else:
                 print(f"\n file `{file_name}` not found.")
                 return
         else:
+            print(f"📖 reading file `{file_name}`... ")
             text_file_content = _increment_file_name(
                 text_file_content=text_file_content, file_name=file_name
             )
@@ -51,13 +53,15 @@ def load_txt(file_name="input", verbose=False):
                 file_name=os.path.join(dir_files, text_files[0]),
             )
         else:
-            print("\
+            print("\nFiles list:")
             for index, file in enumerate(text_files):
                 print(f"[{index}]:", os.path.join(dir_files, file))

             text_file_option = -1
             while text_file_option < 0 or text_file_option >= len(text_files):
-                text_file_option = input(
+                text_file_option = input(
+                    "Choose a text file. Use `*` for process all and `q` to quit. Enter the file number: "
+                )
                 if text_file_option == "*":
                     for file in text_files:
                         text_file_content = _increment_file_name(
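A sketch of the new input dispatch in `load_txt`: a path that does not exist on disk but looks like a URL is treated as a one-line input (and re-queued for scraping via `untouch_url`), while any other missing path is rejected:

```python
import os

# Mirrors the branch order above: existence check first, then scheme check.
def classify_input(file_name):
    if not os.path.exists(file_name):
        if file_name.startswith("https://") or file_name.startswith("http://"):
            return "url"
        return "missing"
    return "file"

assert classify_input("https://example.com/jobs/123") == "url"
assert classify_input("./no-such-file.txt") == "missing"
```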
ohmyscrapper/modules/process_with_ai.py
CHANGED

@@ -28,13 +28,13 @@ def process_ai_response(response):
         url_parent = urls_manager.get_url_by_id(url_child_xml["id"])
         if len(url_parent) > 0:
             url_parent = url_parent.iloc[0]
-            h1 = url_child_xml.copy()
-            del h1["id"]
-            del h1["url"]
-            h1 = " - ".join(h1.values())
+            title = url_child_xml.copy()
+            del title["id"]
+            del title["url"]
+            title = " - ".join(title.values())
             if url_parent["description_links"] > 1 and url_child_xml["id"] != "":
-                print("-- child updated -- \n", url_child_xml["url"], ":", h1)
-                urls_manager.set_url_h1(url_child_xml["url"], h1)
+                print("-- child updated -- \n", url_child_xml["url"], ":", title)
+                urls_manager.set_url_title(url_child_xml["url"], title)
                 urls_manager.set_url_ai_processed_by_url(
                     url_child_xml["url"], str(json.dumps(url_child_xml))
                 )
@@ -43,8 +43,8 @@ def process_ai_response(response):
                     url_parent["url"], "children-update"
                 )
             else:
-                print("-- parent updated -- \n", url_parent["url"], ":", h1)
-                urls_manager.set_url_h1(url_parent["url"], h1)
+                print("-- parent updated -- \n", url_parent["url"], ":", title)
+                urls_manager.set_url_title(url_parent["url"], title)
                 urls_manager.set_url_ai_processed_by_url(
                     url_parent["url"], str(json.dumps(url_child_xml))
                 )
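How the renamed `title` is assembled above, in isolation: every field of the AI's per-URL record except `id` and `url` is joined with `" - "`. The record fields here (`position`, `company`) are illustrative, not a fixed schema:

```python
url_child_xml = {
    "id": "7",
    "url": "https://example.com/jobs/7",
    "position": "Data Engineer",
    "company": "Acme",
}
title = url_child_xml.copy()
del title["id"]
del title["url"]
title = " - ".join(title.values())  # insertion order: position, company
assert title == "Data Engineer - Acme"
```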
ohmyscrapper/modules/scrap_urls.py
CHANGED

@@ -2,154 +2,138 @@ import ohmyscrapper.models.urls_manager as urls_manager
 import ohmyscrapper.modules.sniff_url as sniff_url
 import ohmyscrapper.modules.load_txt as load_txt
 import ohmyscrapper.modules.classify_urls as classify_urls
+from ohmyscrapper.core import config

 import time
 import random


-def process_linkedin_redirect(url_report, url, verbose=False):
-    if
-
-
-    if url_report["total-a-links"] < 5:
-        if "first-a-link" in url_report.keys():
-            url_destiny = url_report["first-a-link"]
-        else:
-            urls_manager.set_url_error(url=url["url"], value="error: no first-a-link")
-            if verbose:
-                print("no url for:", url["url"])
-            return
-    else:
-        if "og:url" in url_report.keys():
-            url_destiny = url_report["og:url"]
-        else:
-            urls_manager.set_url_error(url=url["url"], value="error: no og:url")
-            if verbose:
-                print("no url for:", url["url"])
-            return
-    if verbose:
-        print(url["url"], ">>", url_destiny)
-    urls_manager.add_url(url=url_destiny)
-    urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)
-
+def scrap_url(url, verbose=False):
+    if url["url_type"] is None:
+        url["url_type"] = "generic"

-def process_linkedin_feed(url_report, url, verbose=False):
     if verbose:
-        print("
+        print("\n\n", url["url_type"] + ":", url["url"])

-
-
-
-
+    try:
+        url_type = url["url_type"]
+        sniffing_config = config.get_url_sniffing()
+
+        if url_type not in sniffing_config:
+            default_type_sniffing = {
+                "bodytags": {"h1": "title"},
+                "metatags": {
+                    "og:title": "title",
+                    "og:description": "description",
+                    "description": "description",
+                },
+            }
+            config.append_url_sniffing({url_type: default_type_sniffing})
+            sniffing_config = config.get_url_sniffing()
+
+        url_report = sniff_url.get_tags(
+            url=url["url"], sniffing_config=sniffing_config[url_type]
+        )
+    except Exception as e:
+        urls_manager.set_url_error(url=url["url"], value="error on scrapping")
+        urls_manager.touch_url(url=url["url"])
         if verbose:
-            print("
+            print("\n\n!!! ERROR FOR:", url["url"])
+            print(
+                "\n\n!!! you can check the URL using the command sniff-url",
+                url["url"],
+                "\n\n",
+            )
         return

-
-
-
-
-
+    process_sniffed_url(
+        url_report=url_report,
+        url=url,
+        sniffing_config=sniffing_config[url_type],
+        verbose=verbose,
+    )

-
-def process_linkedin_job(url_report, url, verbose=False):
-    print("linkedin_job")
-    changed = False
-    if "h1" in url_report.keys():
-        if verbose:
-            print(url["url"], ": ", url_report["h1"])
-        urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
-        changed = True
-    elif "og:title" in url_report.keys():
-        if verbose:
-            print(url["url"], ": ", url_report["og:title"])
-        urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
-        changed = True
+    urls_manager.set_url_json(url=url["url"], value=url_report["json"])
+    urls_manager.touch_url(url=url["url"])

-    if "description" in url_report.keys():
-        urls_manager.set_url_description(
-            url=url["url"], value=url_report["description"]
-        )
-        changed = True
-    elif "og:description" in url_report.keys():
-        urls_manager.set_url_description(
-            url=url["url"], value=url_report["og:description"]
-        )
-        changed = True
-    if not changed:
-        urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")
+    return


-def
+def process_sniffed_url(url_report, url, sniffing_config, verbose=False):
     if verbose:
-        print("
+        print(url["url_type"])
         print(url["url"])
     changed = False
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    db_fields = {}
+    db_fields["title"] = None
+    db_fields["description"] = None
+    db_fields["url_destiny"] = None
+
+    if "metatags" in sniffing_config.keys():
+        for tag, bd_field in sniffing_config["metatags"].items():
+            if tag in url_report.keys():
+                if bd_field[:1] == "+":
+                    if db_fields[bd_field[1:]] is None:
+                        db_fields[bd_field[1:]] = ""
+                    db_fields[bd_field[1:]] = (
+                        db_fields[bd_field[1:]] + " " + url_report[tag]
+                    )
+                else:
+                    db_fields[bd_field] = url_report[tag]
+
+    if "bodytags" in sniffing_config.keys():
+        for tag, bd_field in sniffing_config["bodytags"].items():
+            if tag in url_report.keys():
+                if bd_field[:1] == "+":
+                    if db_fields[bd_field[1:]] is None:
+                        db_fields[bd_field[1:]] = ""
+                    db_fields[bd_field[1:]] = (
+                        db_fields[bd_field[1:]] + " " + url_report[tag]
+                    )
+                else:
+                    db_fields[bd_field] = url_report[tag]
+
+    if (
+        "atags" in sniffing_config.keys()
+        and "first-tag-as-url_destiny" in sniffing_config["atags"].keys()
+    ):
+        if (
+            url_report["total-a-links"]
+            < sniffing_config["atags"]["first-tag-as-url_destiny"]
+        ):
+            if "first-a-link" in url_report.keys():
+                db_fields["url_destiny"] = url_report["first-a-link"]
+    if (
+        "atags" in sniffing_config.keys()
+        and "load_links" in sniffing_config["atags"].keys()
+    ):
+        for a_link in url_report["a_links"]:
+            urls_manager.add_url(url=a_link["href"], parent_url=url["url"])
+
+    if db_fields["title"] is not None:
+        urls_manager.set_url_title(url=url["url"], value=db_fields["title"])
         changed = True

-    if description is not None:
-        urls_manager.set_url_description(url=url["url"], value=description)
+    if db_fields["description"] is not None:
+        urls_manager.set_url_description(url=url["url"], value=db_fields["description"])
         description_links = load_txt.put_urls_from_string(
-            text_to_process=description, parent_url=url["url"]
+            text_to_process=db_fields["description"], parent_url=url["url"]
         )
         urls_manager.set_url_description_links(url=url["url"], value=description_links)

-    if not changed:
-        urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")
-
-
-def scrap_url(url, verbose=False):
-    # TODO: Need to change this
-
-    if url["url_type"] is None:
-        if verbose:
-            print("\n\ngeneric:", url["url"])
-        url["url_type"] = "generic"
-    else:
-        if verbose:
-            print("\n\n", url["url_type"] + ":", url["url"])
-    try:
-        url_report = sniff_url.get_tags(url=url["url"])
-    except Exception as e:
-        urls_manager.set_url_error(url=url["url"], value="error")
-        urls_manager.touch_url(url=url["url"])
-        if verbose:
-            print("\n\n!!! ERROR FOR:", url["url"])
-            print(
-                "\n\n!!! you can check the URL using the command sniff-url",
-                url["url"],
-                "\n\n",
-            )
-        return
-
-    if url["url_type"] == "linkedin_redirect":
-        process_linkedin_redirect(url_report=url_report, url=url, verbose=verbose)
-
-    if url["url_type"] == "linkedin_feed":
-        process_linkedin_feed(url_report=url_report, url=url, verbose=verbose)
-
-    if url["url_type"] == "linkedin_job":
-        process_linkedin_job(url_report=url_report, url=url, verbose=verbose)
+        changed = True

-    if
-
+    if db_fields["url_destiny"] is not None:
+        urls_manager.add_url(url=db_fields["url_destiny"])
+        urls_manager.set_url_destiny(url=url["url"], destiny=db_fields["url_destiny"])
+        changed = True

-
-
+    if not changed:
+        urls_manager.set_url_error(
+            url=url["url"],
+            value="error: no title, url_destiny or description was founded",
+        )


 def isNaN(num):
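A worked example of the `"+"`-prefix handling in `process_sniffed_url`: a mapping value such as `+description` appends the tag's text to the accumulated field instead of replacing it. None of the shipped `url_sniffing.yaml` entries use the prefix yet, so this config is hypothetical:

```python
db_fields = {"title": None, "description": None, "url_destiny": None}
sniffing_config = {
    "metatags": {"og:description": "description", "description": "+description"}
}
url_report = {"og:description": "Remote role.", "description": "Apply by May."}

# Same logic as the metatags loop above.
for tag, bd_field in sniffing_config["metatags"].items():
    if tag in url_report.keys():
        if bd_field[:1] == "+":
            if db_fields[bd_field[1:]] is None:
                db_fields[bd_field[1:]] = ""
            db_fields[bd_field[1:]] = db_fields[bd_field[1:]] + " " + url_report[tag]
        else:
            db_fields[bd_field] = url_report[tag]

assert db_fields["description"] == "Remote role. Apply by May."
```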
ohmyscrapper/modules/seed.py
CHANGED

@@ -2,7 +2,10 @@ import ohmyscrapper.models.urls_manager as urls_manager
 from ohmyscrapper.core import config


-def seed():
+def seed(reset=False):
+    if reset:
+        urls_manager.reset_seeds()
+
     if not config.url_types_file_exists():
         db_url_types = urls_manager.get_urls_valid_prefix()
         if len(db_url_types) > 0:
ohmyscrapper/modules/sniff_url.py
CHANGED

@@ -1,41 +1,74 @@
 import requests
 from bs4 import BeautifulSoup
 import json
+from ohmyscrapper.core import config


-def sniff_url(
-
-
+def sniff_url(
+    url="https://www.linkedin.com/in/cesardesouzacardoso/",
+    silent=False,
+    sniffing_config={},
+):
+    final_report = {}
+    final_report["error"] = None
+    if "metatags" in sniffing_config:
+        metatags_to_search = sniffing_config["metatags"]
+    else:
+        metatags_to_search = [
+            "description",
+            "og:url",
+            "og:title",
+            "og:description",
+            "og:type",
+            "lnkd:url",
+        ]
+
+    if "bodytags" in sniffing_config:
+        body_tags_to_search = sniffing_config["bodytags"]
+    else:
+        body_tags_to_search = {
+            "h1": "",
+            "h2": "",
+        }

-
-
+    if type(metatags_to_search) is dict:
+        metatags_to_search = list(metatags_to_search.keys())

-    metatags_to_search = [
-        "description",
-        "og:url",
-        "og:title",
-        "og:description",
-        "og:type",
-        "lnkd:url",
-    ]
+    # force clean concatenate without any separator
+    if type(body_tags_to_search) is dict:
+        body_tags_to_search = list(body_tags_to_search.keys())

-
-    "
-
-
+    if type(body_tags_to_search) is list:
+        body_tags_to_search = dict.fromkeys(body_tags_to_search, " ")
+
+    if not silent:
+        print("checking url:", url)
+
+    try:
+        r = requests.get(url=url, timeout=config.get_sniffing("timeout"))
+        soup = BeautifulSoup(r.text, "html.parser")
+    except requests.exceptions.ReadTimeout:
+        url_domain = url.split("/")[2]
+        final_report["error"] = (
+            f"!!! timeout (10 seconds) while checking the url with domain: `{url_domain}` !!!"
+        )
+        print(f"\n\n{final_report['error']}\n\n")
+        soup = BeautifulSoup("", "html.parser")

-    final_report = {}
     final_report["scrapped-url"] = url
-
-
-
+    if len(metatags_to_search) > 0:
+        final_report.update(
+            _extract_meta_tags(
+                soup=soup, silent=silent, metatags_to_search=metatags_to_search
+            )
         )
-
-
-
-
+
+    if len(body_tags_to_search) > 0:
+        final_report.update(
+            _extract_text_tags(
+                soup=soup, silent=silent, body_tags_to_search=body_tags_to_search
+            )
         )
-    )
     final_report["a_links"] = _extract_a_tags(soup=soup, silent=silent)
     final_report = _complementary_report(final_report, soup, silent).copy()
     final_report["json"] = json.dumps(final_report)
@@ -85,24 +118,24 @@ def _extract_meta_tags(soup, silent, metatags_to_search):
     return valid_meta_tags


-def _extract_text_tags(soup, silent,
+def _extract_text_tags(soup, silent, body_tags_to_search):
     valid_text_tags = {}
     if not silent:
         print("\n\n\n\n---- all <text> tags ---\n")
     i = 0
-    for text_tag, separator in
+    for text_tag, separator in body_tags_to_search.items():
         if len(soup.find_all(text_tag)) > 0:
             valid_text_tags[text_tag] = []
             for obj_tag in soup.find_all(text_tag):
                 valid_text_tags[text_tag].append(obj_tag.text.strip())
             valid_text_tags[text_tag] = separator.join(valid_text_tags[text_tag])
-
-
-
-
-
-
-
+            i = i + 1
+            if not silent:
+                print("-- text tag", i, "--")
+                print("name:", text_tag)
+                print("separator:", separator)
+                print("texts:", valid_text_tags[text_tag])
+                print("---------------- \n")
     return valid_text_tags

@@ -128,5 +161,5 @@ def _complementary_report(final_report, soup, silent):
     return final_report


-def get_tags(url):
-    return sniff_url(url=url, silent=True)
+def get_tags(url, sniffing_config={}):
+    return sniff_url(url=url, silent=True, sniffing_config=sniffing_config)
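The tag-list normalisation added to `sniff_url`, in isolation: body tags may arrive as a `{tag: db_field}` dict from `url_sniffing.yaml` or as a plain list; both collapse to a `{tag: separator}` dict, where `" "` is the separator later used to join repeated occurrences of the same tag:

```python
body_tags_to_search = {"h1": "title"}  # dict form, as read from the YAML
if type(body_tags_to_search) is dict:
    body_tags_to_search = list(body_tags_to_search.keys())
if type(body_tags_to_search) is list:
    body_tags_to_search = dict.fromkeys(body_tags_to_search, " ")
assert body_tags_to_search == {"h1": " "}
```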
{ohmyscrapper-0.6.1.dist-info → ohmyscrapper-0.7.4.dist-info}/METADATA
CHANGED

@@ -1,9 +1,10 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: ohmyscrapper
-Version: 0.6.1
+Version: 0.7.4
 Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
+License-Expression: MIT
 Requires-Dist: beautifulsoup4>=4.14.3
 Requires-Dist: google-genai>=1.55.0
 Requires-Dist: markdown>=3.10
@@ -14,9 +15,11 @@ Requires-Dist: requests>=2.32.5
 Requires-Dist: rich>=14.2.0
 Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
+Project-URL: Changelog, https://github.com/bouli/ohmyscrapper/releases/latest
+Project-URL: Repository, https://github.com/bouli/ohmyscrapper
 Description-Content-Type: text/markdown

-# 🐶 OhMyScrapper - v0.6.1
+# 🐶 OhMyScrapper - v0.7.4

 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.
ohmyscrapper-0.7.4.dist-info/RECORD
ADDED

@@ -0,0 +1,21 @@
+ohmyscrapper/__init__.py,sha256=x3wLMhIU744W9DRtXoTrPpWghb7UdC3UJSYZh_gpzlw,7095
+ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
+ohmyscrapper/core/config.py,sha256=aaSLxk6Fuzp88EMax6MAOX3WszH4OfYLz_dJoXlu0ME,3142
+ohmyscrapper/core/config_files.py,sha256=C79-Vgz1E5_jUWtob-yrCyBxsqWEXxqPI_r6TL7D1_Q,3314
+ohmyscrapper/core/default_files/config.yaml,sha256=gi8tqhSumQYJIl8QDisJ6eaib2tdcBNT-GFU-e6Dtns,273
+ohmyscrapper/core/default_files/url_sniffing.yaml,sha256=RU5GYWmC1PdBl4nn7HUfRBwuXz8Rlap75d4W3zWDzPM,465
+ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
+ohmyscrapper/models/urls_manager.py,sha256=k0N1If4YoRUWHX80OyBNEeJNIzDROc2ur6j8q2OBlqo,12103
+ohmyscrapper/modules/classify_urls.py,sha256=GhiosAQUITy1DQe_PksYV9QRKVTgpkSE28dkutzbWVA,1038
+ohmyscrapper/modules/load_txt.py,sha256=pkWBIdh6vORPfENDZ6wGM89vswnOnc1flqKfkLs9RD8,4138
+ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
+ohmyscrapper/modules/process_with_ai.py,sha256=kl39Jzl-PUwh6AfmTZ9SLFUYs9Sk4biqgt8rNz3X1FA,7255
+ohmyscrapper/modules/scrap_urls.py,sha256=uN5j0dychVMGu7n1rcpYdba4sqc47ssyCn0tVaiz-Ic,6264
+ohmyscrapper/modules/seed.py,sha256=hHEGSoPXsmclTaRPeIcK2oC1Xpg3_JqBv_YFMD0m5Jw,1044
+ohmyscrapper/modules/show.py,sha256=jsAs4g8ouA9wymkBfkDCbpVWKD-m_20uKG-m1cZAUGA,3877
+ohmyscrapper/modules/sniff_url.py,sha256=1QnxEdCWLjLh0uM72dlPzst64qglqg2MHA_xYlNcLSA,5435
+ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
+ohmyscrapper-0.7.4.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
+ohmyscrapper-0.7.4.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
+ohmyscrapper-0.7.4.dist-info/METADATA,sha256=CVE8WUcraUtONy9UVIU0y8Y7wjsk4zEmMVfpA_al1CU,4261
+ohmyscrapper-0.7.4.dist-info/RECORD,,
ohmyscrapper-0.6.1.dist-info/RECORD
REMOVED

@@ -1,20 +0,0 @@
-ohmyscrapper/__init__.py,sha256=TGOizxll-06nyJdYSM8SRUccQ5Xhv6dDNW6sIbuH0Mk,6493
-ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
-ohmyscrapper/core/config.py,sha256=_me0T6IQqz7bA6Kh6IofNrb-o-07nipcLozUuPrz0l4,2722
-ohmyscrapper/core/config_files.py,sha256=KC3yChTnlclclU9EKTqFBoAu9p6XdOKuegub5NPYDDY,2434
-ohmyscrapper/core/default_files/config.yaml,sha256=9nMOhnnJUcZudXUq5WBEXCCgezfUKI3m4azIuSch_wQ,214
-ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
-ohmyscrapper/models/urls_manager.py,sha256=93WvHnk89hA2BfJfDsD2JlZBeRxo2T_F3FfypiRKKHs,11523
-ohmyscrapper/modules/classify_urls.py,sha256=4rt7_iPDcCGHhJg-f75wBfFmvjdvQj1xFFP-if_IeFM,926
-ohmyscrapper/modules/load_txt.py,sha256=dNkUZ2ehBiPx-q4fPczRiHFvnpzCrjeycFtexhWGmEE,3967
-ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
-ohmyscrapper/modules/process_with_ai.py,sha256=Th-HMJzQYGQ4UBG8AGFsF5cCKIa1HlPATfmGLTTAE24,7222
-ohmyscrapper/modules/scrap_urls.py,sha256=dxpvPyJWtmQj1vZ6IgnhcICWw1eOxYOeplDfZzDTLw4,6864
-ohmyscrapper/modules/seed.py,sha256=qDUE7TWx9iNQEzqThK4p7g8pTZjdpkmoqI8kOo_zdtk,983
-ohmyscrapper/modules/show.py,sha256=jsAs4g8ouA9wymkBfkDCbpVWKD-m_20uKG-m1cZAUGA,3877
-ohmyscrapper/modules/sniff_url.py,sha256=dF6Nv54TC1Si-FRyqtw4V2WNk3NqaJ1h_PzwZm3UNzk,4126
-ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
-ohmyscrapper-0.6.1.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
-ohmyscrapper-0.6.1.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
-ohmyscrapper-0.6.1.dist-info/METADATA,sha256=k06ZCfkLkDuy_GvCj6jAFq2xfCUA5gN8cVlDH-2Q6Bs,4096
-ohmyscrapper-0.6.1.dist-info/RECORD,,
{ohmyscrapper-0.6.1.dist-info → ohmyscrapper-0.7.4.dist-info}/WHEEL
File without changes

{ohmyscrapper-0.6.1.dist-info → ohmyscrapper-0.7.4.dist-info}/entry_points.txt
File without changes