ohmyscrapper 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- ohmyscrapper/__init__.py +1 -1
- ohmyscrapper/core/config.py +12 -0
- ohmyscrapper/core/default_files/config.yaml +1 -0
- ohmyscrapper/core/default_files/url_sniffing.yaml +25 -0
- ohmyscrapper/models/urls_manager.py +31 -19
- ohmyscrapper/modules/classify_urls.py +5 -1
- ohmyscrapper/modules/process_with_ai.py +8 -8
- ohmyscrapper/modules/scrap_urls.py +100 -122
- ohmyscrapper/modules/sniff_url.py +50 -27
- {ohmyscrapper-0.6.1.dist-info → ohmyscrapper-0.7.0.dist-info}/METADATA +2 -2
- ohmyscrapper-0.7.0.dist-info/RECORD +21 -0
- ohmyscrapper-0.6.1.dist-info/RECORD +0 -20
- {ohmyscrapper-0.6.1.dist-info → ohmyscrapper-0.7.0.dist-info}/WHEEL +0 -0
- {ohmyscrapper-0.6.1.dist-info → ohmyscrapper-0.7.0.dist-info}/entry_points.txt +0 -0
ohmyscrapper/__init__.py
CHANGED
@@ -20,7 +20,7 @@ from ohmyscrapper.core.config import update

 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.6.1")
+    parser.add_argument("--version", action="version", version="%(prog)s v0.7.0")

     update()
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
ohmyscrapper/core/config.py
CHANGED
@@ -67,6 +67,18 @@ def get_url_types():
     )


+def get_url_sniffing():
+    file = get_files("url_sniffing")
+    return config_files.create_and_read_config_file(
+        file, default_app_dir=default_app_dir
+    )
+
+
+def append_url_sniffing(data):
+    file = get_files("url_sniffing")
+    _append_config_file(data, file)
+
+
 def append_url_types(url_types):
     url_types_file = get_files("url_types")
     _append_config_file(url_types, url_types_file)
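The two helpers added above give the rest of the package read and append access to the new sniffing rules. A minimal usage sketch, mirroring the lookup-then-register pattern scrap_urls.py adopts later in this diff (the "my_custom_type" key is a made-up example):

    # Hypothetical caller: get_url_sniffing() returns the parsed
    # url_sniffing.yaml as a dict keyed by url_type.
    from ohmyscrapper.core import config

    rules = config.get_url_sniffing()
    if "my_custom_type" not in rules:  # invented example key
        # append_url_sniffing() persists extra rules into the user's config file
        config.append_url_sniffing(
            {"my_custom_type": {"metatags": {"og:title": "title"}}}
        )
        rules = config.get_url_sniffing()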
ohmyscrapper/core/default_files/url_sniffing.yaml
ADDED
@@ -0,0 +1,25 @@
+linkedin_feed:
+  metatags:
+    og:url: url_destiny
+
+linkedin_job:
+  bodytags:
+    h1: title
+  metatags:
+    og:title: title
+    og:description: description
+    description: description
+
+linkedin_post:
+  bodytags:
+    h1: title
+  metatags:
+    og:title: title
+    og:description: description
+    description: description
+
+linkedin_redirect:
+  metatags:
+    og:url: url_destiny
+  atags:
+    first-tag-as-url_destiny: 5
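Each top-level key in this new file is a url_type; under it, metatags and bodytags map a scraped tag name to a database column (title, description, url_destiny), and atags.first-tag-as-url_destiny is the total-a-links threshold below which the first anchor is treated as the destination URL. Per the mapping loop added to scrap_urls.py below, a column name prefixed with "+" is appended to instead of overwritten. A standalone sketch of those semantics, with an invented url_report:

    # Sketch of the rule semantics; the sample values are invented.
    rule = {"metatags": {"og:title": "title", "og:description": "+description"}}
    url_report = {"og:title": "Data Engineer", "og:description": "Remote role"}

    db_fields = {"title": None, "description": None, "url_destiny": None}
    for tag, field in rule.get("metatags", {}).items():
        if tag not in url_report:
            continue
        if field.startswith("+"):  # "+" prefix: concatenate into the column
            name = field[1:]
            db_fields[name] = ((db_fields[name] or "") + " " + url_report[tag]).strip()
        else:
            db_fields[field] = url_report[tag]

    print(db_fields)
    # {'title': 'Data Engineer', 'description': 'Remote role', 'url_destiny': None}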
ohmyscrapper/models/urls_manager.py
CHANGED
@@ -29,7 +29,11 @@ def use_connection(func):
     def provide_connection(*args, **kwargs):
         global conn
         with get_db_connection() as conn:
-            return func(*args, **kwargs)
+            try:
+                return func(*args, **kwargs)
+            except:
+                update_db()
+                return func(*args, **kwargs)

     return provide_connection

@@ -38,7 +42,7 @@ def create_tables(conn):

     c = conn.cursor()
     c.execute(
-        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, h1 TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
+        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, title TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
     )
     c.execute(
         "CREATE TABLE IF NOT EXISTS ai_log (id INTEGER PRIMARY KEY, instructions STRING, response STRING, model STRING, prompt_file STRING, prompt_name STRING, created_at DATETIME)"

@@ -49,6 +53,14 @@ def create_tables(conn):
     )


+def update_db():
+    try:
+        c = conn.cursor()
+        c.execute("ALTER TABLE urls RENAME COLUMN h1 TO title")
+    except:
+        pass
+
+
 def seeds(seeds={}):

     for url_type, url_prefix in seeds.items():

@@ -118,7 +130,7 @@ def get_urls_report():
         SELECT
             u.id,
             u.url,
-            u.h1
+            u.title
         FROM urls u
         INNER JOIN parent_url p
         ON u.url = p.parent_url

@@ -127,9 +139,9 @@ def get_urls_report():
             u.id,
             u.url_type,
             u.url,
-            COALESCE(u.h1, p.h1) as h1,
+            COALESCE(u.title, p.title) as title,
             p.url as parent_url,
-            p.h1 as parent_h1
+            p.title as parent_title
         FROM urls u
         LEFT JOIN parents p
         ON u.parent_url = p.url

@@ -185,12 +197,12 @@ def get_url_like_unclassified(like_condition):


 @use_connection
-def add_url(url, h1=None, parent_url=None):
+def add_url(url, title=None, parent_url=None):
     url = clean_url(url)
     c = conn.cursor()

-    if h1 is not None:
-        h1 = h1.strip()
+    if title is not None:
+        title = title.strip()

     if parent_url is None:
         parent_url = None

@@ -199,8 +211,8 @@ def add_url(url, h1=None, parent_url=None):

     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url, h1, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
-            (url, h1, parent_url, int(time.time())),
+            "INSERT INTO urls (url, title, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
+            (url, title, parent_url, int(time.time())),
         )
         conn.commit()

@@ -239,20 +251,20 @@ def set_url_destiny(url, destiny):


 @use_connection
-def set_url_h1(url, value):
+def set_url_title(url, value):
     value = str(value).strip()
     url = clean_url(url)
     c = conn.cursor()
-    c.execute("UPDATE urls SET h1 = ? WHERE url = ?", (value, url))
+    c.execute("UPDATE urls SET title = ? WHERE url = ?", (value, url))
     conn.commit()


 @use_connection
-def set_url_h1_by_id(id, value):
+def set_url_title_by_id(id, value):
     value = str(value).strip()

     c = conn.cursor()
-    c.execute("UPDATE urls SET h1 = ? WHERE id = ?", (value, id))
+    c.execute("UPDATE urls SET title = ? WHERE id = ?", (value, id))
     conn.commit()


@@ -427,16 +439,16 @@ def merge_dbs() -> None:


 @use_connection
-def merge_url(url, h1, last_touch, created_at, description, json):
+def merge_url(url, title, last_touch, created_at, description, json):
     url = clean_url(url)
     c = conn.cursor()

-    if h1 is not None:
-        h1 = h1.strip()
+    if title is not None:
+        title = title.strip()

     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url, h1, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
-            (url, h1, last_touch, created_at, description, json),
+            "INSERT INTO urls (url, title, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
+            (url, title, last_touch, created_at, description, json),
         )
         conn.commit()
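The decorator change above makes every DB access self-migrating: a failed call triggers update_db(), which renames the legacy h1 column to title, and the call is retried once. A minimal sketch of the pattern in isolation, using an in-memory sqlite3 database in place of the real get_db_connection():

    # Sketch of the retry-after-migration pattern (not the package's exact code).
    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE urls (id INTEGER PRIMARY KEY, h1 TEXT)")

    def update_db():
        try:
            # RENAME COLUMN requires SQLite >= 3.25
            conn.execute("ALTER TABLE urls RENAME COLUMN h1 TO title")
        except sqlite3.OperationalError:
            pass  # already migrated

    def use_connection(func):
        def provide_connection(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except sqlite3.OperationalError:  # e.g. "no such column: title"
                update_db()
                return func(*args, **kwargs)
        return provide_connection

    @use_connection
    def titles():
        return conn.execute("SELECT title FROM urls").fetchall()

    print(titles())  # first call fails, migrates, retries, returns []

The shipped code catches a bare except and retries unconditionally, so a second failure still propagates to the caller.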
ohmyscrapper/modules/classify_urls.py
CHANGED
@@ -1,11 +1,15 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.modules import seed
 import pandas as pd
 import time


 def classify_urls(recursive=False):
-    urls_manager.seeds()
     df = urls_manager.get_urls_valid_prefix()
+    if len(df) == 0:
+        seed.seed()
+        classify_urls(recursive=recursive)
+        return

     keep_alive = True
     while keep_alive:
ohmyscrapper/modules/process_with_ai.py
CHANGED
@@ -28,13 +28,13 @@ def process_ai_response(response):
         url_parent = urls_manager.get_url_by_id(url_child_xml["id"])
         if len(url_parent) > 0:
             url_parent = url_parent.iloc[0]
-            h1 = url_child_xml.copy()
-            del h1["id"]
-            del h1["url"]
-            h1 = " - ".join(h1.values())
+            title = url_child_xml.copy()
+            del title["id"]
+            del title["url"]
+            title = " - ".join(title.values())
             if url_parent["description_links"] > 1 and url_child_xml["id"] != "":
-                print("-- child updated -- \n", url_child_xml["url"], ":", h1)
-                urls_manager.set_url_h1(url_child_xml["url"], h1)
+                print("-- child updated -- \n", url_child_xml["url"], ":", title)
+                urls_manager.set_url_title(url_child_xml["url"], title)
                 urls_manager.set_url_ai_processed_by_url(
                     url_child_xml["url"], str(json.dumps(url_child_xml))
                 )

@@ -43,8 +43,8 @@ def process_ai_response(response):
                     url_parent["url"], "children-update"
                 )
             else:
-                print("-- parent updated -- \n", url_parent["url"], ":", h1)
-                urls_manager.set_url_h1(url_parent["url"], h1)
+                print("-- parent updated -- \n", url_parent["url"], ":", title)
+                urls_manager.set_url_title(url_parent["url"], title)
                 urls_manager.set_url_ai_processed_by_url(
                     url_parent["url"], str(json.dumps(url_child_xml))
                 )
ohmyscrapper/modules/scrap_urls.py
CHANGED
@@ -2,154 +2,132 @@ import ohmyscrapper.models.urls_manager as urls_manager
 import ohmyscrapper.modules.sniff_url as sniff_url
 import ohmyscrapper.modules.load_txt as load_txt
 import ohmyscrapper.modules.classify_urls as classify_urls
+from ohmyscrapper.core import config

 import time
 import random


-def process_linkedin_redirect(url_report, url, verbose=False):
-    if …
-    …
-    …
-    if url_report["total-a-links"] < 5:
-        if "first-a-link" in url_report.keys():
-            url_destiny = url_report["first-a-link"]
-        else:
-            urls_manager.set_url_error(url=url["url"], value="error: no first-a-link")
-            if verbose:
-                print("no url for:", url["url"])
-            return
-    else:
-        if "og:url" in url_report.keys():
-            url_destiny = url_report["og:url"]
-        else:
-            urls_manager.set_url_error(url=url["url"], value="error: no og:url")
-            if verbose:
-                print("no url for:", url["url"])
-            return
-    if verbose:
-        print(url["url"], ">>", url_destiny)
-    urls_manager.add_url(url=url_destiny)
-    urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)
-
+def scrap_url(url, verbose=False):
+    if url["url_type"] is None:
+        url["url_type"] = "generic"

-def process_linkedin_feed(url_report, url, verbose=False):
     if verbose:
-        print("…
+        print("\n\n", url["url_type"] + ":", url["url"])

-    …
-    …
-    …
-    …
+    try:
+        url_type = url["url_type"]
+        sniffing_config = config.get_url_sniffing()
+
+        if url_type not in sniffing_config:
+            default_type_sniffing = {
+                "bodytags": [{"h1": "title"}],
+                "metatags": [
+                    {"og:title": "title"},
+                    {"og:description": "description"},
+                    {"description": "description"},
+                ],
+            }
+            config.append_url_sniffing({url_type: default_type_sniffing})
+            sniffing_config = config.get_url_sniffing()
+
+        url_report = sniff_url.get_tags(
+            url=url["url"], sniffing_config=sniffing_config[url_type]
+        )
+    except Exception as e:
+        urls_manager.set_url_error(url=url["url"], value="error on scrapping")
+        urls_manager.touch_url(url=url["url"])
         if verbose:
-            print("…
+            print("\n\n!!! ERROR FOR:", url["url"])
+            print(
+                "\n\n!!! you can check the URL using the command sniff-url",
+                url["url"],
+                "\n\n",
+            )
         return

-    …
-    …
-    …
-    …
-    …
+    process_sniffed_url(
+        url_report=url_report,
+        url=url,
+        sniffing_config=sniffing_config[url_type],
+        verbose=verbose,
+    )

-def process_linkedin_job(url_report, url, verbose=False):
-    if verbose:
-        print("linkedin_job")
-    changed = False
-    if "h1" in url_report.keys():
-        if verbose:
-            print(url["url"], ": ", url_report["h1"])
-        urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
-        changed = True
-    elif "og:title" in url_report.keys():
-        if verbose:
-            print(url["url"], ": ", url_report["og:title"])
-        urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
-        changed = True
+    urls_manager.set_url_json(url=url["url"], value=url_report["json"])
+    urls_manager.touch_url(url=url["url"])

-    if "description" in url_report.keys():
-        urls_manager.set_url_description(
-            url=url["url"], value=url_report["description"]
-        )
-        changed = True
-    elif "og:description" in url_report.keys():
-        urls_manager.set_url_description(
-            url=url["url"], value=url_report["og:description"]
-        )
-        changed = True
-    if not changed:
-        urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")
+    return


-def …
+def process_sniffed_url(url_report, url, sniffing_config, verbose=False):
     if verbose:
-        print("…
+        print(url["url_type"])
         print(url["url"])
     changed = False
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
+
+    db_fields = {}
+    db_fields["title"] = None
+    db_fields["description"] = None
+    db_fields["url_destiny"] = None
+
+    if "metatags" in sniffing_config.keys():
+        for tag, bd_field in sniffing_config["metatags"].items():
+            if tag in url_report.keys():
+                if bd_field[:1] == "+":
+                    if db_fields[bd_field[1:]] is None:
+                        db_fields[bd_field[1:]] = ""
+                    db_fields[bd_field[1:]] = (
+                        db_fields[bd_field[1:]] + " " + url_report[tag]
+                    )
+                else:
+                    db_fields[bd_field] = url_report[tag]
+
+    if "bodytags" in sniffing_config.keys():
+        for tag, bd_field in sniffing_config["bodytags"].items():
+            if tag in url_report.keys():
+                if bd_field[:1] == "+":
+                    if db_fields[bd_field[1:]] is None:
+                        db_fields[bd_field[1:]] = ""
+                    db_fields[bd_field[1:]] = (
+                        db_fields[bd_field[1:]] + " " + url_report[tag]
+                    )
+                else:
+                    db_fields[bd_field] = url_report[tag]
+
+    if (
+        "atags" in sniffing_config.keys()
+        and "first-tag-as-url_destiny" in sniffing_config["atags"].keys()
+    ):
+        if (
+            url_report["total-a-links"]
+            < sniffing_config["atags"]["first-tag-as-url_destiny"]
+        ):
+            if "first-a-link" in url_report.keys():
+                db_fields["url_destiny"] = url_report["first-a-link"]
+
+    if db_fields["title"] is not None:
+        urls_manager.set_url_title(url=url["url"], value=db_fields["title"])
         changed = True

-    if description is not None:
-        urls_manager.set_url_description(url=url["url"], value=description)
+    if db_fields["description"] is not None:
+        urls_manager.set_url_description(url=url["url"], value=db_fields["description"])
         description_links = load_txt.put_urls_from_string(
-            text_to_process=description, parent_url=url["url"]
+            text_to_process=db_fields["description"], parent_url=url["url"]
         )
         urls_manager.set_url_description_links(url=url["url"], value=description_links)

-    if not changed:
-        urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")
-
-
-def scrap_url(url, verbose=False):
-    # TODO: Need to change this
-
-    if url["url_type"] is None:
-        if verbose:
-            print("\n\ngeneric:", url["url"])
-        url["url_type"] = "generic"
-    else:
-        if verbose:
-            print("\n\n", url["url_type"] + ":", url["url"])
-    try:
-        url_report = sniff_url.get_tags(url=url["url"])
-    except Exception as e:
-        urls_manager.set_url_error(url=url["url"], value="error")
-        urls_manager.touch_url(url=url["url"])
-        if verbose:
-            print("\n\n!!! ERROR FOR:", url["url"])
-            print(
-                "\n\n!!! you can check the URL using the command sniff-url",
-                url["url"],
-                "\n\n",
-            )
-        return
-
-    if url["url_type"] == "linkedin_redirect":
-        process_linkedin_redirect(url_report=url_report, url=url, verbose=verbose)
-
-    if url["url_type"] == "linkedin_feed":
-        process_linkedin_feed(url_report=url_report, url=url, verbose=verbose)
-
-    if url["url_type"] == "linkedin_job":
-        process_linkedin_job(url_report=url_report, url=url, verbose=verbose)
+        changed = True

-    if …
-    …
+    if db_fields["url_destiny"] is not None:
+        urls_manager.add_url(url=db_fields["url_destiny"])
+        urls_manager.set_url_destiny(url=url["url"], destiny=db_fields["url_destiny"])
+        changed = True

-    …
-    …
+    if not changed:
+        urls_manager.set_url_error(
+            url=url["url"],
+            value="error: no title, url_destiny or description was founded",
+        )


 def isNaN(num):
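The rewrite above replaces the hard-coded process_linkedin_* functions with a single config-driven process_sniffed_url(). One piece worth calling out is the redirect heuristic: a first-tag-as-url_destiny threshold means "if the page has fewer than N anchors, treat the first one as the real destination". A standalone sketch with invented values (the example.com URL is a placeholder):

    # Sketch of the atags redirect heuristic from process_sniffed_url.
    rule = {"atags": {"first-tag-as-url_destiny": 5}}
    url_report = {"total-a-links": 2, "first-a-link": "https://example.com/job/123"}

    url_destiny = None
    atags = rule.get("atags", {})
    if "first-tag-as-url_destiny" in atags:
        # a near-empty interstitial page: take its first anchor as the target
        if url_report["total-a-links"] < atags["first-tag-as-url_destiny"]:
            url_destiny = url_report.get("first-a-link")

    print(url_destiny)  # https://example.com/job/123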
ohmyscrapper/modules/sniff_url.py
CHANGED
@@ -3,39 +3,62 @@ from bs4 import BeautifulSoup
 import json


-def sniff_url(url="https://www.linkedin.com/in/cesardesouzacardoso/", silent=False):
+def sniff_url(
+    url="https://www.linkedin.com/in/cesardesouzacardoso/",
+    silent=False,
+    sniffing_config={},
+):
+    if "metatags" in sniffing_config:
+        metatags_to_search = sniffing_config["metatags"]
+    else:
+        metatags_to_search = [
+            "description",
+            "og:url",
+            "og:title",
+            "og:description",
+            "og:type",
+            "lnkd:url",
+        ]
+
+    if "bodytags" in sniffing_config:
+        body_tags_to_search = sniffing_config["bodytags"]
+    else:
+        body_tags_to_search = {
+            "h1": "",
+            "h2": "",
+        }
+
+    if type(metatags_to_search) is dict:
+        metatags_to_search = list(metatags_to_search.keys())
+
+    # force clean concatenate without any separator
+    if type(body_tags_to_search) is dict:
+        body_tags_to_search = list(body_tags_to_search.keys())
+
+    if type(body_tags_to_search) is list:
+        body_tags_to_search = dict.fromkeys(body_tags_to_search, " ")
+
     if not silent:
         print("checking url:", url)

     r = requests.get(url=url)
     soup = BeautifulSoup(r.text, "html.parser")

-    metatags_to_search = [
-        "description",
-        "og:url",
-        "og:title",
-        "og:description",
-        "og:type",
-        "lnkd:url",
-    ]
-
-    text_tags_to_search = {
-        "h1": "",
-        "h2": "|",
-    }
-
     final_report = {}
     final_report["scrapped-url"] = url
-    final_report.update(
-        _extract_meta_tags(
-            soup=soup, silent=silent, metatags_to_search=metatags_to_search
+    if len(metatags_to_search) > 0:
+        final_report.update(
+            _extract_meta_tags(
+                soup=soup, silent=silent, metatags_to_search=metatags_to_search
+            )
         )
-    )
-    final_report.update(
-        _extract_text_tags(
-            soup=soup, silent=silent, text_tags_to_search=text_tags_to_search
+
+    if len(body_tags_to_search) > 0:
+        final_report.update(
+            _extract_text_tags(
+                soup=soup, silent=silent, body_tags_to_search=body_tags_to_search
+            )
         )
-    )
     final_report["a_links"] = _extract_a_tags(soup=soup, silent=silent)
     final_report = _complementary_report(final_report, soup, silent).copy()
     final_report["json"] = json.dumps(final_report)

@@ -85,12 +108,12 @@ def _extract_meta_tags(soup, silent, metatags_to_search):
     return valid_meta_tags


-def _extract_text_tags(soup, silent, text_tags_to_search):
+def _extract_text_tags(soup, silent, body_tags_to_search):
     valid_text_tags = {}
     if not silent:
         print("\n\n\n\n---- all <text> tags ---\n")
     i = 0
-    for text_tag, separator in text_tags_to_search.items():
+    for text_tag, separator in body_tags_to_search.items():
         if len(soup.find_all(text_tag)) > 0:
             valid_text_tags[text_tag] = []
             for obj_tag in soup.find_all(text_tag):

@@ -128,5 +151,5 @@ def _complementary_report(final_report, soup, silent):
     return final_report


-def get_tags(url):
-    return sniff_url(url=url, silent=True)
+def get_tags(url, sniffing_config={}):
+    return sniff_url(url=url, silent=True, sniffing_config=sniffing_config)
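get_tags() now forwards a per-type sniffing_config, and sniff_url() falls back to its historical defaults when a section is missing; bodytags given as a dict or list are normalized to a {tag: separator} mapping. A usage sketch under the new signature (requests and bs4 must be installed, the URL is a placeholder, and only report keys visible in this diff are assumed):

    import ohmyscrapper.modules.sniff_url as sniff_url

    report = sniff_url.get_tags(
        url="https://example.com/some-job-posting",  # placeholder URL
        sniffing_config={"metatags": {"og:title": "title"}, "bodytags": {"h1": ""}},
    )
    print(report["scrapped-url"])              # always present
    print(report.get("og:title"))              # present if the page had the metatag
    print(report["a_links"], report["json"])   # anchor report and JSON dump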
{ohmyscrapper-0.6.1.dist-info → ohmyscrapper-0.7.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ohmyscrapper
-Version: 0.6.1
+Version: 0.7.0
 Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>

@@ -16,7 +16,7 @@ Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown

-# 🐶 OhMyScrapper - v0.6.1
+# 🐶 OhMyScrapper - v0.7.0

 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.
ohmyscrapper-0.7.0.dist-info/RECORD
ADDED
@@ -0,0 +1,21 @@
+ohmyscrapper/__init__.py,sha256=w5Ty9eszf8tEv72IQrFov0YbZWMqsraq448xhX3YGQs,6493
+ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
+ohmyscrapper/core/config.py,sha256=i_RA-zReNQIWWmsFar85qzRUqdqvTFMPeCP7Hya7ltU,2996
+ohmyscrapper/core/config_files.py,sha256=KC3yChTnlclclU9EKTqFBoAu9p6XdOKuegub5NPYDDY,2434
+ohmyscrapper/core/default_files/config.yaml,sha256=bgPBVlze2tOCbyrA47h_5BJ35UsXnqsjQszzy0vn-Pw,248
+ohmyscrapper/core/default_files/url_sniffing.yaml,sha256=MKdVR5HQ1i2yTRw2ijzxPSmIyhUno_R4L2k17r3EBBc,417
+ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
+ohmyscrapper/models/urls_manager.py,sha256=FC1j72M1gzNwC_PzPqnew986b-BI6s7zUv8Z7HiM1M0,11849
+ohmyscrapper/modules/classify_urls.py,sha256=GhiosAQUITy1DQe_PksYV9QRKVTgpkSE28dkutzbWVA,1038
+ohmyscrapper/modules/load_txt.py,sha256=dNkUZ2ehBiPx-q4fPczRiHFvnpzCrjeycFtexhWGmEE,3967
+ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
+ohmyscrapper/modules/process_with_ai.py,sha256=kl39Jzl-PUwh6AfmTZ9SLFUYs9Sk4biqgt8rNz3X1FA,7255
+ohmyscrapper/modules/scrap_urls.py,sha256=CNoEC-d1r-u4qxnEVimm4ctP6MJGdU8y8VI2Nx0bBdM,6033
+ohmyscrapper/modules/seed.py,sha256=qDUE7TWx9iNQEzqThK4p7g8pTZjdpkmoqI8kOo_zdtk,983
+ohmyscrapper/modules/show.py,sha256=jsAs4g8ouA9wymkBfkDCbpVWKD-m_20uKG-m1cZAUGA,3877
+ohmyscrapper/modules/sniff_url.py,sha256=zJ2Uox2aUdQibL4UFLxg3t7GqJ7WwWEl0q3QSUbMEbc,4960
+ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
+ohmyscrapper-0.7.0.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
+ohmyscrapper-0.7.0.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
+ohmyscrapper-0.7.0.dist-info/METADATA,sha256=Doakf4oDT6oskPGdSlEoRJHBxUmm9FhWaHfDlNIfNuM,4096
+ohmyscrapper-0.7.0.dist-info/RECORD,,
ohmyscrapper-0.6.1.dist-info/RECORD
DELETED
@@ -1,20 +0,0 @@
-ohmyscrapper/__init__.py,sha256=TGOizxll-06nyJdYSM8SRUccQ5Xhv6dDNW6sIbuH0Mk,6493
-ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
-ohmyscrapper/core/config.py,sha256=_me0T6IQqz7bA6Kh6IofNrb-o-07nipcLozUuPrz0l4,2722
-ohmyscrapper/core/config_files.py,sha256=KC3yChTnlclclU9EKTqFBoAu9p6XdOKuegub5NPYDDY,2434
-ohmyscrapper/core/default_files/config.yaml,sha256=9nMOhnnJUcZudXUq5WBEXCCgezfUKI3m4azIuSch_wQ,214
-ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
-ohmyscrapper/models/urls_manager.py,sha256=93WvHnk89hA2BfJfDsD2JlZBeRxo2T_F3FfypiRKKHs,11523
-ohmyscrapper/modules/classify_urls.py,sha256=4rt7_iPDcCGHhJg-f75wBfFmvjdvQj1xFFP-if_IeFM,926
-ohmyscrapper/modules/load_txt.py,sha256=dNkUZ2ehBiPx-q4fPczRiHFvnpzCrjeycFtexhWGmEE,3967
-ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
-ohmyscrapper/modules/process_with_ai.py,sha256=Th-HMJzQYGQ4UBG8AGFsF5cCKIa1HlPATfmGLTTAE24,7222
-ohmyscrapper/modules/scrap_urls.py,sha256=dxpvPyJWtmQj1vZ6IgnhcICWw1eOxYOeplDfZzDTLw4,6864
-ohmyscrapper/modules/seed.py,sha256=qDUE7TWx9iNQEzqThK4p7g8pTZjdpkmoqI8kOo_zdtk,983
-ohmyscrapper/modules/show.py,sha256=jsAs4g8ouA9wymkBfkDCbpVWKD-m_20uKG-m1cZAUGA,3877
-ohmyscrapper/modules/sniff_url.py,sha256=dF6Nv54TC1Si-FRyqtw4V2WNk3NqaJ1h_PzwZm3UNzk,4126
-ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
-ohmyscrapper-0.6.1.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
-ohmyscrapper-0.6.1.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
-ohmyscrapper-0.6.1.dist-info/METADATA,sha256=k06ZCfkLkDuy_GvCj6jAFq2xfCUA5gN8cVlDH-2Q6Bs,4096
-ohmyscrapper-0.6.1.dist-info/RECORD,,
{ohmyscrapper-0.6.1.dist-info → ohmyscrapper-0.7.0.dist-info}/WHEEL
File without changes
{ohmyscrapper-0.6.1.dist-info → ohmyscrapper-0.7.0.dist-info}/entry_points.txt
File without changes