ohmyscrapper-0.7.4-py3-none-any.whl → ohmyscrapper-0.8.4-py3-none-any.whl
This diff shows the content of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- ohmyscrapper/__init__.py +32 -9
- ohmyscrapper/core/config.py +3 -2
- ohmyscrapper/core/config_files.py +14 -10
- ohmyscrapper/core/default_files/config.yaml +9 -5
- ohmyscrapper/core/default_files/url_sniffing.yaml +1 -1
- ohmyscrapper/models/urls_manager.py +5 -3
- ohmyscrapper/modules/browser.py +28 -0
- ohmyscrapper/modules/cache.py +100 -0
- ohmyscrapper/modules/classify_urls.py +4 -2
- ohmyscrapper/modules/load_txt.py +2 -0
- ohmyscrapper/modules/process_with_ai.py +10 -11
- ohmyscrapper/modules/scrap_urls.py +18 -9
- ohmyscrapper/modules/show.py +4 -2
- ohmyscrapper/modules/sniff_url.py +67 -12
- {ohmyscrapper-0.7.4.dist-info → ohmyscrapper-0.8.4.dist-info}/METADATA +4 -3
- ohmyscrapper-0.8.4.dist-info/RECORD +23 -0
- {ohmyscrapper-0.7.4.dist-info → ohmyscrapper-0.8.4.dist-info}/WHEEL +2 -2
- ohmyscrapper-0.7.4.dist-info/RECORD +0 -21
- {ohmyscrapper-0.7.4.dist-info → ohmyscrapper-0.8.4.dist-info}/entry_points.txt +0 -0
ohmyscrapper/__init__.py
CHANGED
```diff
@@ -1,26 +1,27 @@
 import argparse
 
+from ohmyscrapper.core.config import update
+from ohmyscrapper.modules import cache
 from ohmyscrapper.modules.classify_urls import classify_urls
-from ohmyscrapper.modules.sniff_url import sniff_url
 from ohmyscrapper.modules.load_txt import load_txt
-from ohmyscrapper.modules.
+from ohmyscrapper.modules.merge_dbs import merge_dbs
+from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
 from ohmyscrapper.modules.scrap_urls import scrap_urls
+from ohmyscrapper.modules.seed import export_url_types_to_file, seed
 from ohmyscrapper.modules.show import (
+    export_report,
+    export_urls,
     show_url,
     show_urls,
     show_urls_valid_prefix,
-    export_urls,
-    export_report,
 )
+from ohmyscrapper.modules.sniff_url import sniff_url
 from ohmyscrapper.modules.untouch_all import untouch_all
-from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
-from ohmyscrapper.modules.merge_dbs import merge_dbs
-from ohmyscrapper.core.config import update
 
 
 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.7.4")
+    parser.add_argument("--version", action="version", version="%(prog)s v0.8.4")
 
     update()
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
@@ -104,6 +105,16 @@ def main():
     sniff_url_parser.add_argument(
         "url", default="https://cesarcardoso.cc/", help="Url to sniff"
     )
+    sniff_url_parser.add_argument(
+        "--metatags",
+        default="mt",
+        help="Meta tags you want to watch separated by comma ','",
+    )
+    sniff_url_parser.add_argument(
+        "--bodytags",
+        default="bd",
+        help="Body tags you want to watch separated by comma ','",
+    )
 
     show_urls_parser = subparsers.add_parser("show", help="Show urls and prefixes")
     show_urls_parser.add_argument(
@@ -131,6 +142,7 @@ def main():
     )
     merge_parser = subparsers.add_parser("merge_dbs", help="Merge databases.")
 
+    clean_cache_parser = subparsers.add_parser("cleancache", help="Clean cache.")
     args = parser.parse_args()
 
     if args.command == "classify-urls":
@@ -153,7 +165,15 @@ def main():
         return
 
     if args.command == "sniff-url":
-
+        sniffing_config = {}
+        if len(args.metatags) > 0:
+            sniffing_config["metatags"] = str(args.metatags).split(",")
+
+        if len(args.bodytags) > 0:
+            sniffing_config["bodytags"] = str(args.bodytags).split(",")
+
+        sniff_url(args.url, sniffing_config=sniffing_config)
+
         return
 
     if args.command == "scrap-urls":
@@ -218,6 +238,9 @@ def main():
         export_report()
         return
 
+    if args.command == "cleancache":
+        cache.clean()
+
 
 if __name__ == "__main__":
     main()
```
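Read together, the changes above add two comma-separated list flags to `sniff-url` plus a `cleancache` command. A minimal sketch of how the flag values become the `sniffing_config` dict (the `og:` values are illustrative, not package defaults):

```python
import argparse

# Mirrors the sniff-url wiring from main() above, in isolation.
parser = argparse.ArgumentParser(prog="ohmyscrapper")
sub = parser.add_subparsers(dest="command")
sniff = sub.add_parser("sniff-url")
sniff.add_argument("url")
sniff.add_argument("--metatags", default="mt")
sniff.add_argument("--bodytags", default="bd")

args = parser.parse_args(
    ["sniff-url", "https://example.com/", "--metatags", "og:title,og:description"]
)

sniffing_config = {}
if len(args.metatags) > 0:
    sniffing_config["metatags"] = str(args.metatags).split(",")
if len(args.bodytags) > 0:
    sniffing_config["bodytags"] = str(args.bodytags).split(",")

print(sniffing_config)
# {'metatags': ['og:title', 'og:description'], 'bodytags': ['bd']}
```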
ohmyscrapper/core/config.py
CHANGED
```diff
@@ -1,4 +1,5 @@
 import os
+
 from ohmyscrapper.core import config_files
 
 default_app_dir = "ohmyscrapper"
@@ -69,14 +70,14 @@ def url_types_file_exists():
 def get_url_types():
     url_types_file = get_files("url_types")
     return config_files.create_and_read_config_file(
-        url_types_file, default_app_dir=default_app_dir
+        url_types_file, default_app_dir=default_app_dir, complete_file=False
     )
 
 
 def get_url_sniffing():
     file = get_files("url_sniffing")
     return config_files.create_and_read_config_file(
-        file, default_app_dir=default_app_dir
+        file, default_app_dir=default_app_dir, complete_file=False
     )
 
 
```
ohmyscrapper/core/config_files.py
CHANGED

```diff
@@ -1,8 +1,11 @@
 import os
+
 import yaml
 
 
-def create_and_read_config_file(file_name, default_app_dir, force_default=False):
+def create_and_read_config_file(
+    file_name, default_app_dir, force_default=False, complete_file=True
+):
     config_file = config_file_path(file_name, default_app_dir)
     default_config_params = _get_default_file(default_file=file_name)
     if force_default or not os.path.exists(config_file):
@@ -15,17 +18,18 @@ def create_and_read_config_file(file_name, default_app_dir, force_default=False)
     else:
         with open(config_file, "r") as f:
             config_params = yaml.safe_load(f.read())
-        if complete_config_file(
-            config_params=config_params,
-            default_config_params=default_config_params,
-            file_name=file_name,
-            default_app_dir=default_app_dir,
-        ):
-            config_params = create_and_read_config_file(
-                file_name=file_name,
-                default_app_dir=default_app_dir,
-            )
-
+        if complete_file:
+            if complete_config_file(
+                config_params=config_params,
+                default_config_params=default_config_params,
+                file_name=file_name,
+                default_app_dir=default_app_dir,
+            ):
+                config_params = create_and_read_config_file(
+                    file_name=file_name,
+                    default_app_dir=default_app_dir,
+                    force_default=force_default,
+                )
 
     if config_params is None:
         config_params = create_and_read_config_file(
```
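The new `complete_file` keyword lets callers skip the step that tops a config file up with missing default keys. A sketch of the call as `config.get_url_types()` now makes it, assuming the default `url_types.yaml` file name from config.yaml:

```python
from ohmyscrapper.core import config_files

# complete_file=False reads the file as-is instead of merging in defaults,
# which is what config.get_url_types() and get_url_sniffing() now do.
url_types = config_files.create_and_read_config_file(
    "url_types.yaml", default_app_dir="ohmyscrapper", complete_file=False
)
```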
ohmyscrapper/core/default_files/config.yaml
CHANGED

```diff
@@ -2,11 +2,12 @@ db:
   db_file: local.db
 
 default_dirs:
-  db: ./
-  input: ./
-  output: ./
-  prompts: ./
-  templates: ./
+  db: ./ohmyscrapper_db
+  input: ./ohmyscrapper_input
+  output: ./ohmyscrapper_output
+  prompts: ./ohmyscrapper_prompts
+  templates: ./ohmyscrapper_templates
+  cache: ./ohmyscrapper_cache
 
 default_files:
   url_types: url_types.yaml
@@ -17,3 +18,6 @@ ai:
 
 sniffing:
   timeout: 10
+  use-browser: false
+  browser-waiting-time: 5
+  round-sleeping: 10
```
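The three new `sniffing:` keys are read elsewhere in this release through `config.get_sniffing`. A sketch of the assumed accessor semantics, inferred from how browser.py and scrap_urls.py use it below:

```python
from ohmyscrapper.core import config

# Assumption: get_sniffing(key) returns one value from the `sniffing:` block.
if config.get_sniffing("use-browser"):                   # false by default
    wait = config.get_sniffing("browser-waiting-time")   # 5 seconds by default
pause = config.get_sniffing("round-sleeping")            # upper bound for the sleep between rounds
```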
ohmyscrapper/models/urls_manager.py
CHANGED

```diff
@@ -1,9 +1,11 @@
+import glob
 import os
 import sqlite3
 import time
-import glob
-import pandas as pd
 from urllib.parse import urlparse, urlunparse
+
+import pandas as pd
+
 from ohmyscrapper.core import config
 
 
@@ -71,7 +73,7 @@ def seeds(seeds={}):
 
 @use_connection
 def reset_seeds():
-    sql = "DELETE FROM urls_valid_prefix"
+    sql = "DELETE FROM urls_valid_prefix WHERE 1 = 1"
     c = conn.cursor()
     c.execute(sql)
     conn.commit()
```
ohmyscrapper/modules/browser.py
ADDED

```diff
@@ -0,0 +1,28 @@
+from selenium import webdriver
+
+from ohmyscrapper.core.config import get_sniffing
+
+
+def get_driver():
+    if get_sniffing("use-browser") == "safari":
+        from selenium.webdriver.safari.options import Options
+
+        options = Options()
+        driver = webdriver.Safari(options=options)
+    elif get_sniffing("use-browser") == "firefox":
+        from selenium.webdriver.firefox.options import Options
+
+        options = Options()
+        driver = webdriver.Firefox(options=options)
+    elif get_sniffing("use-browser") == "ie":
+        from selenium.webdriver.ie.options import Options
+
+        options = Options()
+        driver = webdriver.Ie(options=options)
+    else:  # default: chrome
+        from selenium.webdriver.chrome.options import Options
+
+        options = Options()
+        driver = webdriver.Chrome(options=options)
+
+    return driver
```
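A usage sketch for the new module, not part of the package itself; it assumes a matching webdriver is installed locally and `use-browser` is set in config.yaml:

```python
from ohmyscrapper.modules import browser

# get_driver() picks Safari, Firefox, IE or Chrome from the `use-browser`
# value; any other value falls through to Chrome.
driver = browser.get_driver()
driver.get("https://example.com/")
html = driver.page_source  # rendered page source, as sniff_url consumes it
driver.quit()
```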
ohmyscrapper/modules/cache.py
ADDED

```diff
@@ -0,0 +1,100 @@
+import os
+
+from ohmyscrapper.core import config
+
+
+def safe_cache_id(func):
+    def _filter_cache_id(*args, **kwargs):
+        if "cache_id" in args:
+            args["cache_id"] = filter_cache_id(args["cache_id"])
+
+        if "cache_id" in kwargs:
+            kwargs["cache_id"] = filter_cache_id(kwargs["cache_id"])
+
+        return func(*args, **kwargs)
+
+    return _filter_cache_id
+
+
+def filter_cache_id(cache_id):
+    cache_id = cache_id.replace('"', "").replace("\\", "")
+    cache_id = f'"{cache_id}"'
+    return cache_id
+
+
+cache_files_extension = "html"
+
+
+@safe_cache_id
+def set(text: str, cache_id: str):
+    cache_folder = config.get_dir("cache")
+    cache_index_file_path = get_cache_index_path()
+
+    cache_folder_files = os.listdir(cache_folder)
+    cached_file_index = get_index_from_file_index(_safe_cache_id=cache_id)
+    if cached_file_index is not None:
+        new_file_index = cached_file_index
+    else:
+        new_file_index = len(cache_folder_files)
+        with open(cache_index_file_path, "a") as cache_index_file_writer:
+            cache_index_file_writer.write(f"\n{new_file_index}: {cache_id}")
+
+    new_file_name = f"{new_file_index}.{cache_files_extension}"
+    new_file_path = os.path.join(cache_folder, new_file_name)
+    with open(new_file_path, "w+") as new_file_writer:
+        new_file_writer.write(text)
+
+
+@safe_cache_id
+def get(cache_id: str) -> str:
+    cached_file_index = get_index_from_file_index(_safe_cache_id=cache_id)
+    code = get_cached_file_by_index(cached_file_index=cached_file_index)
+    return code
+
+
+def get_index_from_file_index(_safe_cache_id):
+    cache_index_file = get_cache_index_file()
+    if cache_index_file.find(_safe_cache_id) < 1:
+        return None
+    cache_index_file = cache_index_file[: cache_index_file.find(_safe_cache_id) - 2]
+    cached_file_index = int(cache_index_file.split("\n")[-1].strip())
+    return cached_file_index
+
+
+def get_cache_index_path() -> str:
+    cache_index_file_name = "cache_index.yaml"
+    cache_folder = config.get_dir("cache")
+    cache_index_file_path = os.path.join(cache_folder, cache_index_file_name)
+    if not os.path.exists(cache_index_file_path):
+        with open(cache_index_file_path, "w+") as cache_index_file_writer:
+            cache_index_file_writer.write(f"0: {cache_index_file_name}")
+
+    return cache_index_file_path
+
+
+def get_cache_index_file() -> str:
+    with open(get_cache_index_path(), "r") as f:
+        cache_index_file_content = f.read()
+
+    return cache_index_file_content
+
+
+def get_cached_file_by_index(cached_file_index: int) -> str:
+    code = None
+    cache_folder = config.get_dir("cache")
+    cached_file_name = f"{cached_file_index}.{cache_files_extension}"
+    cached_file_path = os.path.join(cache_folder, cached_file_name)
+    if not os.path.exists(cached_file_path):
+        return None
+    with open(cached_file_path, "r") as cached_file_reader:
+        code = cached_file_reader.read()
+    return code
+
+
+def clean():
+    cache_folder = config.get_dir("cache")
+    cache_folder_files = os.listdir(cache_folder)
+    for file in cache_folder_files:
+        file_to_clean = os.path.join(cache_folder, file)
+        if os.path.exists(file_to_clean):
+            os.remove(file_to_clean)
```
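A sketch of the cache flow above: `set()` appends `<index>: <id>` to `cache_index.yaml` and writes `<index>.html`, and `get()` resolves the id back to that file. `cache_id` is passed by keyword deliberately, because the `safe_cache_id` decorator only rewrites keyword arguments (its `"cache_id" in args` check matches positional values, not parameter names):

```python
from ohmyscrapper.modules import cache

# Store a page body under an id, then read it back; keyword args ensure the
# id is quoted/filtered by safe_cache_id before hitting the index file.
cache.set(text="<html>...</html>", cache_id="https://example.com/")
html = cache.get(cache_id="https://example.com/")
cache.clean()  # removes every file in the cache folder, index included
```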
ohmyscrapper/modules/process_with_ai.py
CHANGED

```diff
@@ -1,13 +1,15 @@
-import
-
-from bs4 import BeautifulSoup
-from google import genai
-from dotenv import load_dotenv
+import json
+import os
 import random
 import time
-
+
 import yaml
-import
+from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+from google import genai
+
+import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config
 
 # TODO: !!! REFACTOR !!!
 load_dotenv()
@@ -85,15 +87,12 @@ def process_with_ai(recursive=True, triggered_times=0):
 
     texts = ""
     for index, row in df.iterrows():
-        texts = (
-            texts
-            + f"""
+        texts = texts + f"""
 <text>
 <id>{str(row['id'])}</id>
 {row['description']}
 </text>
 """
-        )
     if texts == "":
         print("no urls to process")
         return
```
ohmyscrapper/modules/scrap_urls.py
CHANGED

```diff
@@ -1,14 +1,15 @@
+import random
+import time
+
 import ohmyscrapper.models.urls_manager as urls_manager
-import ohmyscrapper.modules.
-import ohmyscrapper.modules.load_txt as load_txt
+import ohmyscrapper.modules.browser as browser
 import ohmyscrapper.modules.classify_urls as classify_urls
+import ohmyscrapper.modules.load_txt as load_txt
+import ohmyscrapper.modules.sniff_url as sniff_url
 from ohmyscrapper.core import config
 
-import time
-import random
-
 
-def scrap_url(url, verbose=False):
+def scrap_url(url, verbose=False, driver=None):
     if url["url_type"] is None:
         url["url_type"] = "generic"
 
@@ -32,7 +33,7 @@ def scrap_url(url, verbose=False):
         sniffing_config = config.get_url_sniffing()
 
         url_report = sniff_url.get_tags(
-            url=url["url"], sniffing_config=sniffing_config[url_type]
+            url=url["url"], sniffing_config=sniffing_config[url_type], driver=driver
         )
     except Exception as e:
         urls_manager.set_url_error(url=url["url"], value="error on scrapping")
@@ -147,6 +148,7 @@ def scrap_urls(
     only_parents=True,
     verbose=False,
     n_urls=0,
+    driver=None,
 ):
     limit = 10
     classify_urls.classify_urls()
@@ -170,13 +172,19 @@ def scrap_urls(
             time.sleep(wait)
 
         print("🐕 Scrapper is sniffing the url...")
-
+
+        if driver is None and config.get_sniffing("use-browser"):
+            driver = browser.get_driver()
+        scrap_url(url=url, verbose=verbose, driver=driver)
 
     n_urls = n_urls + len(urls)
     print(f"-- 🗃️ {n_urls} scraped urls...")
     classify_urls.classify_urls()
     if recursive:
-        wait = random.randint(
+        wait = random.randint(
+            int(config.get_sniffing("round-sleeping") / 2),
+            int(config.get_sniffing("round-sleeping")),
+        )
         print(
             f"🐶 Scrapper is sleeping for {wait} seconds before next round of {limit} urls"
         )
@@ -188,6 +196,7 @@ def scrap_urls(
             only_parents=only_parents,
             verbose=verbose,
             n_urls=n_urls,
+            driver=driver,
         )
     else:
         print("scrapping is over...")
```
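The thread of `driver=` parameters above implements one pattern: create the Selenium driver lazily on the first url that needs it, then reuse the same instance across urls and recursive rounds instead of launching a browser per url. A sketch under the assumption that `urls` is the current batch and `scrap_url` is the function from this module:

```python
import ohmyscrapper.modules.browser as browser
from ohmyscrapper.core import config

driver = None
for url in urls:  # urls: list of url dicts, as inside scrap_urls()
    # Lazy creation: only start a browser if config asks for one and none
    # exists yet; subsequent urls and recursive rounds reuse it.
    if driver is None and config.get_sniffing("use-browser"):
        driver = browser.get_driver()
    scrap_url(url=url, verbose=False, driver=driver)
```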
ohmyscrapper/modules/show.py
CHANGED
```diff
@@ -1,10 +1,12 @@
-import ohmyscrapper.models.urls_manager as urls_manager
-from ohmyscrapper.core import config
 import math
 import os
+
 from rich.console import Console
 from rich.table import Table
 
+import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config
+
 
 def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):
     output_folder = config.get_dir("output")
```
ohmyscrapper/modules/sniff_url.py
CHANGED

```diff
@@ -1,16 +1,22 @@
+import json
+import os
+import time
+
 import requests
 from bs4 import BeautifulSoup
-
+
+import ohmyscrapper.modules.browser as browser
 from ohmyscrapper.core import config
+from ohmyscrapper.modules import cache
 
 
 def sniff_url(
     url="https://www.linkedin.com/in/cesardesouzacardoso/",
     silent=False,
     sniffing_config={},
+    driver=None,
 ):
     final_report = {}
-    final_report["error"] = None
     if "metatags" in sniffing_config:
         metatags_to_search = sniffing_config["metatags"]
     else:
@@ -45,13 +51,14 @@ def sniff_url(
     print("checking url:", url)
 
     try:
-        r =
-        soup = BeautifulSoup(r
+        r = get_url(url=url, driver=driver)
+        soup = BeautifulSoup(r, "html.parser")
     except requests.exceptions.ReadTimeout:
         url_domain = url.split("/")[2]
         final_report["error"] = (
             f"!!! timeout (10 seconds) while checking the url with domain: `{url_domain}` !!!"
         )
+
         print(f"\n\n{final_report['error']}\n\n")
         soup = BeautifulSoup("", "html.parser")
 
@@ -69,14 +76,14 @@ def sniff_url(
             soup=soup, silent=silent, body_tags_to_search=body_tags_to_search
         )
     )
-    final_report["a_links"] = _extract_a_tags(soup=soup, silent=silent)
+    final_report["a_links"] = _extract_a_tags(soup=soup, silent=silent, url=url)
     final_report = _complementary_report(final_report, soup, silent).copy()
     final_report["json"] = json.dumps(final_report)
 
     return final_report
 
 
-def _extract_a_tags(soup, silent):
+def _extract_a_tags(soup, silent, url=None):
     a_links = []
     if not silent:
         print("\n\n\n\n---- all <a> links ---")
```
```diff
@@ -84,12 +91,19 @@ def _extract_a_tags(soup, silent):
     i = 0
     for a_tag in soup.find_all("a"):
         i = i + 1
-
+
+        href = a_tag.get("href")
+        if href is not None:
+            if url is not None and href[:1] == "/":
+                domain = url.split("//")[0] + "//" + url.split("//")[1].split("/")[0]
+                href = domain + href
+
+            a_links.append({"text": a_tag.text, "href": href})
         if not silent:
             print("\n-- <a> link", i, "-- ")
             print("target:", a_tag.get("target"))
             print("text:", str(a_tag.text).strip())
-            print("href:",
+            print("href:", href)
             print("-------------- ")
     return a_links
 
```
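The new `href` handling makes relative links absolute before they are appended. The string arithmetic in isolation, runnable as-is:

```python
# Hrefs starting with "/" are prefixed with the scheme and host of the page.
url = "https://example.com/jobs/list"
href = "/jobs/123"
if href[:1] == "/":
    domain = url.split("//")[0] + "//" + url.split("//")[1].split("/")[0]
    href = domain + href
print(href)  # https://example.com/jobs/123
```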
```diff
@@ -124,9 +138,21 @@ def _extract_text_tags(soup, silent, body_tags_to_search):
         print("\n\n\n\n---- all <text> tags ---\n")
     i = 0
     for text_tag, separator in body_tags_to_search.items():
-
+        tag = text_tag
+        tag_class = None
+        tag_id = None
+
+        if len(text_tag.split(".")) > 1:
+            tag = text_tag.split(".")[0]
+            tag_class = text_tag.split(".")[1]
+
+        if len(text_tag.split("#")) > 1:
+            tag = text_tag.split("#")[0]
+            tag_id = text_tag.split("#")[1]
+
+        if len(soup.find_all(tag, class_=tag_class, id=tag_id)) > 0:
             valid_text_tags[text_tag] = []
-            for obj_tag in soup.find_all(
+            for obj_tag in soup.find_all(tag, class_=tag_class, id=tag_id):
                 valid_text_tags[text_tag].append(obj_tag.text.strip())
             valid_text_tags[text_tag] = separator.join(valid_text_tags[text_tag])
             i = i + 1
```
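The added lines give body-tag selectors a tiny grammar: `tag`, `tag.class`, or `tag#id`, feeding `soup.find_all(tag, class_=..., id=...)`. The parsing step in isolation (sample selectors are illustrative, not defaults from url_sniffing.yaml):

```python
def parse_selector(text_tag):
    # Split an optional ".class" or "#id" suffix off the tag name.
    tag, tag_class, tag_id = text_tag, None, None
    if len(text_tag.split(".")) > 1:
        tag, tag_class = text_tag.split(".")[0], text_tag.split(".")[1]
    if len(text_tag.split("#")) > 1:
        tag, tag_id = text_tag.split("#")[0], text_tag.split("#")[1]
    return tag, tag_class, tag_id

print(parse_selector("h2.job-title"))     # ('h2', 'job-title', None)
print(parse_selector("div#description"))  # ('div', None, 'description')
print(parse_selector("p"))                # ('p', None, None)
```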
```diff
@@ -161,5 +187,34 @@ def _complementary_report(final_report, soup, silent):
     return final_report
 
 
-def get_tags(url, sniffing_config={}):
-    return sniff_url(
+def get_tags(url, sniffing_config={}, driver=None):
+    return sniff_url(
+        url=url, silent=True, sniffing_config=sniffing_config, driver=driver
+    )
+
+
+def get_url(url, driver=None):
+    cache_prefix = "sniff-urf:"
+    cached_code = cache.get(cache_id=cache_prefix + url)
+
+    if cached_code is not None:
+        print("You used the cache for this URL.")
+        return cached_code
+
+    if driver is None and config.get_sniffing("use-browser"):
+        driver = browser.get_driver()
+
+    if driver is not None:
+        try:
+            driver.get(url)
+            time.sleep(config.get_sniffing("browser-waiting-time"))
+            driver.implicitly_wait(config.get_sniffing("browser-waiting-time"))
+            code = driver.page_source
+            cache.set(text=code, cache_id=cache_prefix + url)
+            return code
+        except:
+            print("error")
+            pass
+    code = requests.get(url=url, timeout=config.get_sniffing("timeout")).text
+    cache.set(text=code, cache_id=cache_prefix + url)
+    return code
```
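A sketch of the new fetch path: `get_url()` tries the cache first, then the Selenium driver when `use-browser` is configured, and finally falls back to `requests.get` with the configured timeout (the `"sniff-urf:"` cache-id prefix is as spelled in the source):

```python
from ohmyscrapper.modules import sniff_url

# First call fetches and caches the page; the second is served from cache.
html = sniff_url.get_url("https://example.com/")
html_again = sniff_url.get_url("https://example.com/")  # prints the cache notice
```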
{ohmyscrapper-0.7.4.dist-info → ohmyscrapper-0.8.4.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ohmyscrapper
-Version: 0.7.4
+Version: 0.8.4
 Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
@@ -13,13 +13,14 @@ Requires-Dist: python-dotenv>=1.2.1
 Requires-Dist: pyyaml>=6.0.3
 Requires-Dist: requests>=2.32.5
 Requires-Dist: rich>=14.2.0
+Requires-Dist: selenium>=4.39.0
 Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
-Project-URL: Changelog, https://github.com/bouli/ohmyscrapper/releases/latest
 Project-URL: Repository, https://github.com/bouli/ohmyscrapper
+Project-URL: Changelog, https://github.com/bouli/ohmyscrapper/releases/latest
 Description-Content-Type: text/markdown
 
-# 🐶 OhMyScrapper - v0.7.4
+# 🐶 OhMyScrapper - v0.8.4
 
 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.
```
ohmyscrapper-0.8.4.dist-info/RECORD
ADDED

```diff
@@ -0,0 +1,23 @@
+ohmyscrapper/__init__.py,sha256=C_nhLCKrLogCLQVVHlTJNMEOSFiLDTR0bBCtW8T8kXE,7859
+ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
+ohmyscrapper/core/config.py,sha256=2S8iYMdN5-fCX4EW1cmSd4-XabzqxIgbupBuymV3yjY,3185
+ohmyscrapper/core/config_files.py,sha256=5FyPFpN7WQrlgQWr85s5NF-UbnzbyFsWEVVMOs8iyaw,3411
+ohmyscrapper/core/default_files/config.yaml,sha256=ETuTyFM1fedjehM9cZRoKxRKcYDH0LlPFAZ3vLj0uxU,436
+ohmyscrapper/core/default_files/url_sniffing.yaml,sha256=HUwmGUwuJy7t97bJHgNiZOl1thvD9bLaelPgbEr5bMY,465
+ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
+ohmyscrapper/models/urls_manager.py,sha256=XC8HODdsCEo_nn1j7nH_jy9AUTb4PpmkGlaFWV048TM,12117
+ohmyscrapper/modules/browser.py,sha256=pH41NVqYgay_zEIZfncJbtwz_13REX5HVH8uk581sM4,857
+ohmyscrapper/modules/cache.py,sha256=3EQnv9VYJWrE5fdLwkGEUOAHV16nprhyid6MlBpa9Gg,3228
+ohmyscrapper/modules/classify_urls.py,sha256=oK_UhQPF976cexlarqi14pSw8tWLGYfaIMCXzbAhnpI,1040
+ohmyscrapper/modules/load_txt.py,sha256=Gpob1W_LLfkBnNbtqxgCRNGeyufmHECreDqTlj9O_Mk,4140
+ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
+ohmyscrapper/modules/process_with_ai.py,sha256=TqebqC3_rCx6cbvq3oQhaXLZxGUYpKvhyH3I3zjsA94,7221
+ohmyscrapper/modules/scrap_urls.py,sha256=affq5Vx5BKrl7uL2mpcThDBOXznq0d5fz1if5xAttOA,6627
+ohmyscrapper/modules/seed.py,sha256=hHEGSoPXsmclTaRPeIcK2oC1Xpg3_JqBv_YFMD0m5Jw,1044
+ohmyscrapper/modules/show.py,sha256=i5l8_Zooj6vg1JLqWtvGPWHv7wL53aHZ43-SKS1sF9Y,3879
+ohmyscrapper/modules/sniff_url.py,sha256=NpIMJxNEUzmDkFGVqDJXgVtTWEGKRE_dSiJHNz-vXoE,7027
+ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
+ohmyscrapper-0.8.4.dist-info/WHEEL,sha256=fAguSjoiATBe7TNBkJwOjyL1Tt4wwiaQGtNtjRPNMQA,80
+ohmyscrapper-0.8.4.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
+ohmyscrapper-0.8.4.dist-info/METADATA,sha256=h2Agb2KCKiBkX-HEj_8f9EuV3NOq6AJ9h1WrnPQ79iU,4293
+ohmyscrapper-0.8.4.dist-info/RECORD,,
```
ohmyscrapper-0.7.4.dist-info/RECORD
DELETED

```diff
@@ -1,21 +0,0 @@
-ohmyscrapper/__init__.py,sha256=x3wLMhIU744W9DRtXoTrPpWghb7UdC3UJSYZh_gpzlw,7095
-ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
-ohmyscrapper/core/config.py,sha256=aaSLxk6Fuzp88EMax6MAOX3WszH4OfYLz_dJoXlu0ME,3142
-ohmyscrapper/core/config_files.py,sha256=C79-Vgz1E5_jUWtob-yrCyBxsqWEXxqPI_r6TL7D1_Q,3314
-ohmyscrapper/core/default_files/config.yaml,sha256=gi8tqhSumQYJIl8QDisJ6eaib2tdcBNT-GFU-e6Dtns,273
-ohmyscrapper/core/default_files/url_sniffing.yaml,sha256=RU5GYWmC1PdBl4nn7HUfRBwuXz8Rlap75d4W3zWDzPM,465
-ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
-ohmyscrapper/models/urls_manager.py,sha256=k0N1If4YoRUWHX80OyBNEeJNIzDROc2ur6j8q2OBlqo,12103
-ohmyscrapper/modules/classify_urls.py,sha256=GhiosAQUITy1DQe_PksYV9QRKVTgpkSE28dkutzbWVA,1038
-ohmyscrapper/modules/load_txt.py,sha256=pkWBIdh6vORPfENDZ6wGM89vswnOnc1flqKfkLs9RD8,4138
-ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
-ohmyscrapper/modules/process_with_ai.py,sha256=kl39Jzl-PUwh6AfmTZ9SLFUYs9Sk4biqgt8rNz3X1FA,7255
-ohmyscrapper/modules/scrap_urls.py,sha256=uN5j0dychVMGu7n1rcpYdba4sqc47ssyCn0tVaiz-Ic,6264
-ohmyscrapper/modules/seed.py,sha256=hHEGSoPXsmclTaRPeIcK2oC1Xpg3_JqBv_YFMD0m5Jw,1044
-ohmyscrapper/modules/show.py,sha256=jsAs4g8ouA9wymkBfkDCbpVWKD-m_20uKG-m1cZAUGA,3877
-ohmyscrapper/modules/sniff_url.py,sha256=1QnxEdCWLjLh0uM72dlPzst64qglqg2MHA_xYlNcLSA,5435
-ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
-ohmyscrapper-0.7.4.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
-ohmyscrapper-0.7.4.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
-ohmyscrapper-0.7.4.dist-info/METADATA,sha256=CVE8WUcraUtONy9UVIU0y8Y7wjsk4zEmMVfpA_al1CU,4261
-ohmyscrapper-0.7.4.dist-info/RECORD,,
```
{ohmyscrapper-0.7.4.dist-info → ohmyscrapper-0.8.4.dist-info}/entry_points.txt
File without changes