ohmyscrapper 0.7.1__tar.gz → 0.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/PKG-INFO +7 -3
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/README.md +1 -1
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/pyproject.toml +8 -2
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/__init__.py +11 -2
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/core/config.py +8 -2
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/core/config_files.py +33 -3
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/core/default_files/config.yaml +5 -0
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/core/default_files/url_sniffing.yaml +4 -0
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/models/urls_manager.py +16 -2
- ohmyscrapper-0.8.1/src/ohmyscrapper/modules/browser.py +27 -0
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/modules/load_txt.py +7 -3
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/modules/scrap_urls.py +15 -3
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/modules/seed.py +4 -1
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/modules/sniff_url.py +60 -11
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/__main__.py +0 -0
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/core/default_files/url_types.yaml +0 -0
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/modules/classify_urls.py +0 -0
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/modules/merge_dbs.py +0 -0
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/modules/process_with_ai.py +0 -0
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/modules/show.py +0 -0
- {ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/modules/untouch_all.py +0 -0
{ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/PKG-INFO

```diff
@@ -1,9 +1,10 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: ohmyscrapper
-Version: 0.7.1
+Version: 0.8.1
 Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
+License-Expression: MIT
 Requires-Dist: beautifulsoup4>=4.14.3
 Requires-Dist: google-genai>=1.55.0
 Requires-Dist: markdown>=3.10
@@ -12,11 +13,14 @@ Requires-Dist: python-dotenv>=1.2.1
 Requires-Dist: pyyaml>=6.0.3
 Requires-Dist: requests>=2.32.5
 Requires-Dist: rich>=14.2.0
+Requires-Dist: selenium>=4.39.0
 Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
+Project-URL: Changelog, https://github.com/bouli/ohmyscrapper/releases/latest
+Project-URL: Repository, https://github.com/bouli/ohmyscrapper
 Description-Content-Type: text/markdown
 
-# 🐶 OhMyScrapper - v0.7.1
+# 🐶 OhMyScrapper - v0.8.1
 
 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.
```
{ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/pyproject.toml

```diff
@@ -1,6 +1,7 @@
 [project]
 name = "ohmyscrapper"
-version = "0.7.1"
+version = "0.8.1"
+license = "MIT"
 description = "OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions."
 readme = "README.md"
 authors = [
@@ -16,9 +17,14 @@ dependencies = [
     "pyyaml>=6.0.3",
     "requests>=2.32.5",
     "rich>=14.2.0",
+    "selenium>=4.39.0",
     "urlextract>=1.9.0",
 ]
 
+[project.urls]
+Repository = "https://github.com/bouli/ohmyscrapper"
+Changelog = "https://github.com/bouli/ohmyscrapper/releases/latest"
+
 [project.scripts]
 ohmyscrapper = "ohmyscrapper:main"
 
@@ -29,7 +35,7 @@ build-backend = "uv_build"
 [tool.bumpversion]
 tag = true
 tag_name = "v{new_version}"
-pre_commit_hooks = ["uvx black ./src", "git add src", "
+pre_commit_hooks = ["uvx black ./src", "git add src", "uv sync --upgrade", "git add uv.lock"]
 commit = true
 
 [[tool.bumpversion.files]]
```
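The new `license`, `[project.urls]`, and `selenium` entries surface in the installed distribution as the `License-Expression`, `Project-URL`, and `Requires-Dist` fields shown in the PKG-INFO diff above. A minimal sketch of how to read them back with the standard library (assumes ohmyscrapper 0.8.1 is installed in the current environment):

```python
# Sketch: inspect the new 0.8.1 metadata fields via importlib.metadata.
from importlib.metadata import metadata

meta = metadata("ohmyscrapper")
print(meta["Version"])                 # "0.8.1"
print(meta["License-Expression"])      # "MIT"
print(meta.get_all("Project-URL"))     # Changelog and Repository URLs
print([r for r in meta.get_all("Requires-Dist") if r.startswith("selenium")])
```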
{ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/__init__.py

```diff
@@ -20,7 +20,7 @@ from ohmyscrapper.core.config import update
 
 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.7.1")
+    parser.add_argument("--version", action="version", version="%(prog)s v0.8.1")
 
     update()
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
@@ -53,6 +53,14 @@ def main():
         help="Add all `url_types` from the bank to the `/ohmyscrapper/url_types.yaml` file.",
         action="store_true",
     )
+
+    seed_parser.add_argument(
+        "--reset",
+        default=False,
+        help="Reset all `url_types`.",
+        action="store_true",
+    )
+
     untouch_parser = subparsers.add_parser(
         "untouch-all", help="Untouch all urls. That resets classification"
     )
@@ -137,7 +145,7 @@ def main():
         if args.export:
             export_url_types_to_file()
         else:
-            seed()
+            seed(args.reset)
         return
 
     if args.command == "untouch-all":
@@ -191,6 +199,7 @@ def main():
         return
 
     if args.command == "start":
+        seed()
         if args.input != None:
             load_txt(file_name=args.input)
         else:
```
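The new `--reset` flag is a plain `store_true` option on the `seed` subcommand, forwarded as `seed(args.reset)`. A standalone sketch of the same pattern (a hypothetical mini-parser, not the full ohmyscrapper CLI):

```python
import argparse

# Hypothetical mini-parser reproducing the pattern above: a `seed` subcommand
# with a --reset flag that defaults to False and flips to True when passed.
parser = argparse.ArgumentParser(prog="ohmyscrapper-sketch")
subparsers = parser.add_subparsers(dest="command")
seed_parser = subparsers.add_parser("seed")
seed_parser.add_argument("--reset", default=False, action="store_true")

args = parser.parse_args(["seed", "--reset"])
assert args.command == "seed" and args.reset is True  # seed(args.reset) would reset
```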
{ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/core/config.py

```diff
@@ -39,6 +39,12 @@ def get_ai(param):
     )
 
 
+def get_sniffing(param):
+    return config_files.get_param(
+        parent_param="sniffing", param=param, default_app_dir=default_app_dir
+    )
+
+
 def load_config(force_default=False):
     config_file_name = "config.yaml"
     config_params = config_files.create_and_read_config_file(
@@ -63,14 +69,14 @@ def url_types_file_exists():
 def get_url_types():
     url_types_file = get_files("url_types")
     return config_files.create_and_read_config_file(
-        url_types_file, default_app_dir=default_app_dir
+        url_types_file, default_app_dir=default_app_dir, complete_file=False
     )
 
 
 def get_url_sniffing():
     file = get_files("url_sniffing")
     return config_files.create_and_read_config_file(
-        file, default_app_dir=default_app_dir
+        file, default_app_dir=default_app_dir, complete_file=False
     )
 
 
```
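`get_sniffing` reads keys from the new `sniffing` section of `config.yaml` (the +5 lines to `default_files/config.yaml` are listed in the file summary above but not shown in this diff, so the values below are placeholders). A stand-in sketch of what the callers added in `browser.py`, `scrap_urls.py`, and `sniff_url.py` expect it to return; the key names come from that code:

```python
# Stand-in sketch only: config_files.get_param is not shown in this diff, so a
# plain dict illustrates the shape of the `sniffing` section. The values here
# are placeholders, not the shipped defaults.
sniffing_defaults = {
    "use-browser": False,        # or "chrome" / "firefox" / "safari" / "ie"
    "browser-waiting-time": 5,   # seconds to let a rendered page settle
    "timeout": 10,               # requests timeout, matching the "10 seconds" message
}

def get_sniffing(param):
    # Stand-in for config.get_sniffing(): look the key up under "sniffing".
    return sniffing_defaults.get(param)

print(get_sniffing("use-browser"), get_sniffing("timeout"))
```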
{ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/core/config_files.py

```diff
@@ -2,16 +2,34 @@ import os
 import yaml
 
 
-def create_and_read_config_file(file_name, default_app_dir, force_default=False):
+def create_and_read_config_file(
+    file_name, default_app_dir, force_default=False, complete_file=True
+):
     config_file = config_file_path(file_name, default_app_dir)
+    default_config_params = _get_default_file(default_file=file_name)
     if force_default or not os.path.exists(config_file):
-        config_params = _get_default_file(default_file=file_name)
         overwrite_config_file(
-            data=
+            data=default_config_params,
+            file_name=file_name,
+            default_app_dir=default_app_dir,
         )
+        config_params = default_config_params
     else:
         with open(config_file, "r") as f:
             config_params = yaml.safe_load(f.read())
+        if complete_file:
+            if complete_config_file(
+                config_params=config_params,
+                default_config_params=default_config_params,
+                file_name=file_name,
+                default_app_dir=default_app_dir,
+            ):
+                config_params = create_and_read_config_file(
+                    file_name=file_name,
+                    default_app_dir=default_app_dir,
+                    force_default=force_default,
+                )
+
 
     if config_params is None:
         config_params = create_and_read_config_file(
             file_name=file_name, default_app_dir=default_app_dir, force_default=True
@@ -19,6 +37,18 @@ def create_and_read_config_file(file_name, default_app_dir, force_default=False)
     return config_params
 
 
+def complete_config_file(
+    config_params, default_config_params, file_name, default_app_dir
+):
+    has_updated = False
+    for key, values in default_config_params.items():
+        if key not in config_params.keys():
+            has_updated = True
+            data = {key: values}
+            append_config_file(data, file_name, default_app_dir)
+    return has_updated
+
+
 def overwrite_config_file(data, file_name, default_app_dir):
     config_file = config_file_path(file_name, default_app_dir)
     with open(config_file, "+w") as f:
```
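The new `complete_file` path backfills any top-level key that exists in the bundled default file but is missing from the user's copy, then re-reads the file. A self-contained sketch of that backfill rule, with plain dicts standing in for the YAML files and `append_config_file` simulated by a dict update:

```python
# Sketch of the backfill rule introduced above: top-level keys present in the
# defaults but missing from the user's config are appended, and the caller is
# told whether anything changed.
def complete_config(config_params, default_config_params):
    appended = {}
    for key, values in default_config_params.items():
        if key not in config_params:
            appended[key] = values
    config_params.update(appended)       # stands in for append_config_file()
    return len(appended) > 0             # has_updated

user_cfg = {"ai": {"model": "gemini"}}
defaults = {"ai": {"model": "gemini"}, "sniffing": {"use-browser": False}}
print(complete_config(user_cfg, defaults))   # True: "sniffing" was backfilled
print(user_cfg["sniffing"])
```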
{ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/models/urls_manager.py

```diff
@@ -69,6 +69,14 @@ def seeds(seeds={}):
     return True
 
 
+@use_connection
+def reset_seeds():
+    sql = "DELETE FROM urls_valid_prefix WHERE 1 = 1"
+    c = conn.cursor()
+    c.execute(sql)
+    conn.commit()
+
+
 @use_connection
 def add_urls_valid_prefix(url_prefix, url_type):
 
@@ -198,6 +206,8 @@ def get_url_like_unclassified(like_condition):
 
 @use_connection
 def add_url(url, title=None, parent_url=None):
+    if url[:1] == "/":
+        return
     url = clean_url(url)
     c = conn.cursor()
 
@@ -340,7 +350,9 @@ def set_url_error(url, value):
 @use_connection
 def set_url_type_by_id(url_id, url_type):
     c = conn.cursor()
-    c.execute(
+    c.execute(
+        f"UPDATE urls SET url_type = '{url_type}', last_touch = NULL WHERE id = {url_id}"
+    )
     conn.commit()
 
 
@@ -392,8 +404,10 @@ def touch_url(url):
 @use_connection
 def untouch_url(url):
     url = clean_url(url)
+    url = str(url.strip())
+
     c = conn.cursor()
-    c.execute("UPDATE urls SET last_touch = NULL WHERE url =
+    c.execute(f"UPDATE urls SET last_touch = NULL, url_type = NULL WHERE url = '{url}'")
     conn.commit()
 
 
```
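`add_url` now silently drops bare relative paths (anything starting with "/"); relative hrefs are instead resolved to absolute URLs in `sniff_url._extract_a_tags` before they reach the database. A small sketch of the guard, with an in-memory list standing in for the SQLite `urls` table:

```python
# Sketch of the new add_url guard: relative hrefs such as "/jobs/view/123"
# (hypothetical example) are skipped, so only absolute URLs are stored.
stored_urls = []

def add_url(url, title=None, parent_url=None):
    if url[:1] == "/":          # same guard as in the diff above
        return
    stored_urls.append({"url": url, "title": title, "parent_url": parent_url})

add_url("/jobs/view/123")                       # dropped
add_url("https://example.com/jobs/view/123")    # kept
print(stored_urls)
```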
ohmyscrapper-0.8.1/src/ohmyscrapper/modules/browser.py

```diff
@@ -0,0 +1,27 @@
+from selenium import webdriver
+from ohmyscrapper.core.config import get_sniffing
+
+
+def get_driver():
+    if get_sniffing("use-browser") == "safari":
+        from selenium.webdriver.safari.options import Options
+
+        options = Options()
+        driver = webdriver.Safari(options=options)
+    elif get_sniffing("use-browser") == "firefox":
+        from selenium.webdriver.firefox.options import Options
+
+        options = Options()
+        driver = webdriver.Firefox(options=options)
+    elif get_sniffing("use-browser") == "ie":
+        from selenium.webdriver.ie.options import Options
+
+        options = Options()
+        driver = webdriver.Ie(options=options)
+    else:  # default: chrome
+        from selenium.webdriver.chrome.options import Options
+
+        options = Options()
+        driver = webdriver.Chrome(options=options)
+
+    return driver
```
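`get_driver()` picks a Selenium WebDriver from the `use-browser` sniffing setting and falls back to Chrome. A hedged usage sketch of the default branch (assumes Chrome is installed locally; recent Selenium resolves the driver binary via Selenium Manager, and the `--headless=new` flag is optional, not part of this release's code):

```python
# Usage sketch of the Chrome default that get_driver() returns.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless=new")      # optional: run without a window
driver = webdriver.Chrome(options=options)  # same call as the else branch above
try:
    driver.get("https://example.com")
    print(len(driver.page_source), "bytes of rendered HTML")
finally:
    driver.quit()                           # get_driver() leaves cleanup to the caller
```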
{ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/modules/load_txt.py

```diff
@@ -19,14 +19,16 @@ def load_txt(file_name="input", verbose=False):
 
     text_file_content = ""
     if file_name is not None and not os.path.isdir(file_name):
-        print(f"📖 reading file `{file_name}`... ")
         if not os.path.exists(file_name):
             if file_name.startswith("https://") or file_name.startswith("http://"):
+                print(f"📖 reading url `{file_name}`... ")
                 text_file_content = " " + file_name + " "
+                urls_manager.untouch_url(url=file_name)
             else:
                 print(f"\n file `{file_name}` not found.")
                 return
         else:
+            print(f"📖 reading file `{file_name}`... ")
            text_file_content = _increment_file_name(
                 text_file_content=text_file_content, file_name=file_name
             )
@@ -51,13 +53,15 @@ def load_txt(file_name="input", verbose=False):
                 file_name=os.path.join(dir_files, text_files[0]),
             )
         else:
-            print("\
+            print("\nFiles list:")
             for index, file in enumerate(text_files):
                 print(f"[{index}]:", os.path.join(dir_files, file))
 
             text_file_option = -1
             while text_file_option < 0 or text_file_option >= len(text_files):
-                text_file_option = input(
+                text_file_option = input(
+                    "Choose a text file. Use `*` for process all and `q` to quit. Enter the file number: "
+                )
                 if text_file_option == "*":
                     for file in text_files:
                         text_file_content = _increment_file_name(
```
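`load_txt` now distinguishes a URL argument from a file path: a URL is injected as inline text (so the URL extractor picks it up) and explicitly un-touched so it gets re-scraped, while anything else must exist on disk. A small sketch of that branch (the tuple form of `startswith` is a condensed equivalent of the two checks above):

```python
# Sketch of the input branch: URL vs. file vs. missing path.
import os

def classify_input(file_name):
    if not os.path.exists(file_name):
        if file_name.startswith(("https://", "http://")):
            return "url"        # wrapped in spaces, fed to the URL extractor,
                                # then urls_manager.untouch_url(url=file_name)
        return "missing"        # prints "file not found" and returns
    return "file"               # read and queued via _increment_file_name()

print(classify_input("https://example.com/job/1"))  # url
print(classify_input("does-not-exist.txt"))         # missing
```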
{ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/modules/scrap_urls.py

```diff
@@ -2,13 +2,14 @@ import ohmyscrapper.models.urls_manager as urls_manager
 import ohmyscrapper.modules.sniff_url as sniff_url
 import ohmyscrapper.modules.load_txt as load_txt
 import ohmyscrapper.modules.classify_urls as classify_urls
+import ohmyscrapper.modules.browser as browser
 from ohmyscrapper.core import config
 
 import time
 import random
 
 
-def scrap_url(url, verbose=False):
+def scrap_url(url, verbose=False, driver=None):
     if url["url_type"] is None:
         url["url_type"] = "generic"
 
@@ -32,7 +33,7 @@ def scrap_url(url, verbose=False):
         sniffing_config = config.get_url_sniffing()
 
         url_report = sniff_url.get_tags(
-            url=url["url"], sniffing_config=sniffing_config[url_type]
+            url=url["url"], sniffing_config=sniffing_config[url_type], driver=driver
         )
     except Exception as e:
         urls_manager.set_url_error(url=url["url"], value="error on scrapping")
@@ -104,6 +105,12 @@ def process_sniffed_url(url_report, url, sniffing_config, verbose=False):
     ):
         if "first-a-link" in url_report.keys():
             db_fields["url_destiny"] = url_report["first-a-link"]
+        if (
+            "atags" in sniffing_config.keys()
+            and "load_links" in sniffing_config["atags"].keys()
+        ):
+            for a_link in url_report["a_links"]:
+                urls_manager.add_url(url=a_link["href"], parent_url=url["url"])
 
     if db_fields["title"] is not None:
         urls_manager.set_url_title(url=url["url"], value=db_fields["title"])
@@ -141,6 +148,7 @@ def scrap_urls(
     only_parents=True,
     verbose=False,
     n_urls=0,
+    driver=None,
 ):
     limit = 10
     classify_urls.classify_urls()
@@ -164,7 +172,10 @@ def scrap_urls(
         time.sleep(wait)
 
         print("🐕 Scrapper is sniffing the url...")
-
+
+        if driver is None and config.get_sniffing("use-browser"):
+            driver = browser.get_driver()
+        scrap_url(url=url, verbose=verbose, driver=driver)
 
         n_urls = n_urls + len(urls)
         print(f"-- 🗃️ {n_urls} scraped urls...")
@@ -182,6 +193,7 @@ def scrap_urls(
             only_parents=only_parents,
             verbose=verbose,
             n_urls=n_urls,
+            driver=driver,
         )
     else:
         print("scrapping is over...")
```
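The WebDriver is created lazily (only when `use-browser` is set and no driver exists yet) and then threaded through the recursive `scrap_urls` call, so one browser session is reused for the whole run. A sketch of that create-once-and-reuse pattern, with a counter standing in for `browser.get_driver()` so it runs without Selenium:

```python
# Sketch of the lazy create-and-reuse pattern above.
created = 0

def get_driver_stub():
    global created
    created += 1
    return f"driver-{created}"

def scrap_batch(urls, use_browser=True, driver=None):
    for url in urls:
        if driver is None and use_browser:
            driver = get_driver_stub()        # opened once, on first need
        # scrap_url(url=url, driver=driver) would go here
    return driver                              # handed to the next recursion step

driver = scrap_batch(["u1", "u2"])
driver = scrap_batch(["u3"], driver=driver)    # reused, not recreated
print(created)                                 # 1
```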
{ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/modules/seed.py

```diff
@@ -2,7 +2,10 @@ import ohmyscrapper.models.urls_manager as urls_manager
 from ohmyscrapper.core import config
 
 
-def seed():
+def seed(reset=False):
+    if reset:
+        urls_manager.reset_seeds()
+
     if not config.url_types_file_exists():
         db_url_types = urls_manager.get_urls_valid_prefix()
         if len(db_url_types) > 0:
```
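`seed(reset=True)`, reached via the new `ohmyscrapper seed --reset` flag, clears the stored url prefixes before the normal seeding logic runs. A minimal sketch with an in-memory list standing in for the `urls_valid_prefix` table (the example prefix and type are hypothetical):

```python
# Minimal sketch of the reset flow: --reset wipes the prefixes first.
urls_valid_prefix = [{"prefix": "https://www.linkedin.com/jobs/", "type": "job"}]

def reset_seeds():
    urls_valid_prefix.clear()               # DELETE FROM urls_valid_prefix

def seed(reset=False):
    if reset:
        reset_seeds()
    # ...normal seeding from url_types.yaml / the DB continues here...
    return len(urls_valid_prefix)

print(seed(reset=True))   # 0: prefixes wiped, ready to be re-seeded
```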
{ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/modules/sniff_url.py

```diff
@@ -1,13 +1,18 @@
 import requests
 from bs4 import BeautifulSoup
 import json
+from ohmyscrapper.core import config
+import ohmyscrapper.modules.browser as browser
+import time
 
 
 def sniff_url(
     url="https://www.linkedin.com/in/cesardesouzacardoso/",
     silent=False,
     sniffing_config={},
+    driver=None,
 ):
+    final_report = {}
     if "metatags" in sniffing_config:
         metatags_to_search = sniffing_config["metatags"]
     else:
@@ -41,10 +46,18 @@ def sniff_url(
     if not silent:
         print("checking url:", url)
 
-
-
+    try:
+        r = get_url(url=url, driver=driver)
+        soup = BeautifulSoup(r, "html.parser")
+    except requests.exceptions.ReadTimeout:
+        url_domain = url.split("/")[2]
+        final_report["error"] = (
+            f"!!! timeout (10 seconds) while checking the url with domain: `{url_domain}` !!!"
+        )
+
+        print(f"\n\n{final_report['error']}\n\n")
+        soup = BeautifulSoup("", "html.parser")
 
-    final_report = {}
     final_report["scrapped-url"] = url
     if len(metatags_to_search) > 0:
         final_report.update(
@@ -59,14 +72,14 @@ def sniff_url(
             soup=soup, silent=silent, body_tags_to_search=body_tags_to_search
         )
     )
-    final_report["a_links"] = _extract_a_tags(soup=soup, silent=silent)
+    final_report["a_links"] = _extract_a_tags(soup=soup, silent=silent, url=url)
     final_report = _complementary_report(final_report, soup, silent).copy()
     final_report["json"] = json.dumps(final_report)
 
     return final_report
 
 
-def _extract_a_tags(soup, silent):
+def _extract_a_tags(soup, silent, url=None):
     a_links = []
     if not silent:
         print("\n\n\n\n---- all <a> links ---")
@@ -74,12 +87,18 @@ def _extract_a_tags(soup, silent):
     i = 0
     for a_tag in soup.find_all("a"):
         i = i + 1
-
+
+        href = a_tag.get("href")
+        if url is not None and href[:1] == "/":
+            domain = url.split("//")[0] + "//" + url.split("//")[1].split("/")[0]
+            href = domain + href
+
+        a_links.append({"text": a_tag.text, "href": href})
         if not silent:
             print("\n-- <a> link", i, "-- ")
             print("target:", a_tag.get("target"))
             print("text:", str(a_tag.text).strip())
-            print("href:",
+            print("href:", href)
             print("-------------- ")
     return a_links
 
@@ -114,9 +133,21 @@ def _extract_text_tags(soup, silent, body_tags_to_search):
         print("\n\n\n\n---- all <text> tags ---\n")
     i = 0
     for text_tag, separator in body_tags_to_search.items():
-
+        tag = text_tag
+        tag_class = None
+        tag_id = None
+
+        if len(text_tag.split(".")) > 1:
+            tag = text_tag.split(".")[0]
+            tag_class = text_tag.split(".")[1]
+
+        if len(text_tag.split("#")) > 1:
+            tag = text_tag.split("#")[0]
+            tag_id = text_tag.split("#")[1]
+
+        if len(soup.find_all(tag, class_=tag_class, id=tag_id)) > 0:
            valid_text_tags[text_tag] = []
-            for obj_tag in soup.find_all(
+            for obj_tag in soup.find_all(tag, class_=tag_class, id=tag_id):
                valid_text_tags[text_tag].append(obj_tag.text.strip())
            valid_text_tags[text_tag] = separator.join(valid_text_tags[text_tag])
            i = i + 1
@@ -151,5 +182,23 @@ def _complementary_report(final_report, soup, silent):
     return final_report
 
 
-def get_tags(url, sniffing_config={}):
-    return sniff_url(
+def get_tags(url, sniffing_config={}, driver=None):
+    return sniff_url(
+        url=url, silent=True, sniffing_config=sniffing_config, driver=driver
+    )
+
+
+def get_url(url, driver=None):
+    if driver is None and config.get_sniffing("use-browser"):
+        driver = browser.get_driver()
+
+    if driver is not None:
+        try:
+            driver.get(url)
+            time.sleep(config.get_sniffing("browser-waiting-time"))
+            driver.implicitly_wait(config.get_sniffing("browser-waiting-time"))
+            return driver.page_source
+        except:
+            print("error")
+            pass
+    return requests.get(url=url, timeout=config.get_sniffing("timeout")).text
```
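`_extract_text_tags` now accepts a tiny selector syntax in the `url_sniffing.yaml` keys: a bare tag, `tag.class`, or `tag#id`, split into the arguments for `soup.find_all(tag, class_=tag_class, id=tag_id)`. A sketch of just that parsing step (the example selectors are illustrative, not taken from the shipped defaults):

```python
# Sketch of the selector parsing added above: "h1", "div.description" or
# "section#job-details" become (tag, class, id) triples for soup.find_all().
def parse_selector(text_tag):
    tag, tag_class, tag_id = text_tag, None, None
    if len(text_tag.split(".")) > 1:
        tag, tag_class = text_tag.split(".")[0], text_tag.split(".")[1]
    if len(text_tag.split("#")) > 1:
        tag, tag_id = text_tag.split("#")[0], text_tag.split("#")[1]
    return tag, tag_class, tag_id

print(parse_selector("h1"))                   # ('h1', None, None)
print(parse_selector("div.description"))      # ('div', 'description', None)
print(parse_selector("section#job-details"))  # ('section', None, 'job-details')
```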
The remaining files listed above ({ohmyscrapper-0.7.1 → ohmyscrapper-0.8.1}/src/ohmyscrapper/__main__.py, core/default_files/url_types.yaml, modules/classify_urls.py, modules/merge_dbs.py, modules/process_with_ai.py, modules/show.py, modules/untouch_all.py) are unchanged between 0.7.1 and 0.8.1.