ohmyscrapper 0.4.0__tar.gz → 0.7.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/PKG-INFO +6 -3
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/README.md +1 -1
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/pyproject.toml +9 -2
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/__init__.py +41 -6
- ohmyscrapper-0.7.4/src/ohmyscrapper/core/config.py +113 -0
- ohmyscrapper-0.7.4/src/ohmyscrapper/core/config_files.py +100 -0
- ohmyscrapper-0.7.4/src/ohmyscrapper/core/default_files/config.yaml +19 -0
- ohmyscrapper-0.7.4/src/ohmyscrapper/core/default_files/url_sniffing.yaml +29 -0
- ohmyscrapper-0.7.4/src/ohmyscrapper/core/default_files/url_types.yaml +5 -0
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/models/urls_manager.py +58 -31
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/modules/classify_urls.py +5 -1
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/modules/load_txt.py +26 -20
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/modules/process_with_ai.py +17 -13
- ohmyscrapper-0.7.4/src/ohmyscrapper/modules/scrap_urls.py +193 -0
- ohmyscrapper-0.7.4/src/ohmyscrapper/modules/seed.py +36 -0
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/modules/show.py +15 -14
- ohmyscrapper-0.7.4/src/ohmyscrapper/modules/sniff_url.py +165 -0
- ohmyscrapper-0.4.0/src/ohmyscrapper/modules/scrap_urls.py +0 -209
- ohmyscrapper-0.4.0/src/ohmyscrapper/modules/seed.py +0 -7
- ohmyscrapper-0.4.0/src/ohmyscrapper/modules/sniff_url.py +0 -88
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/__main__.py +0 -0
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/modules/merge_dbs.py +0 -0
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/modules/untouch_all.py +0 -0
{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/PKG-INFO

@@ -1,9 +1,10 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: ohmyscrapper
-Version: 0.4.0
+Version: 0.7.4
 Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
+License-Expression: MIT
 Requires-Dist: beautifulsoup4>=4.14.3
 Requires-Dist: google-genai>=1.55.0
 Requires-Dist: markdown>=3.10
@@ -14,9 +15,11 @@ Requires-Dist: requests>=2.32.5
 Requires-Dist: rich>=14.2.0
 Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
+Project-URL: Changelog, https://github.com/bouli/ohmyscrapper/releases/latest
+Project-URL: Repository, https://github.com/bouli/ohmyscrapper
 Description-Content-Type: text/markdown

-# 🐶 OhMyScrapper - v0.4.0
+# 🐶 OhMyScrapper - v0.7.4

 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.

{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/pyproject.toml

@@ -1,6 +1,7 @@
 [project]
 name = "ohmyscrapper"
-version = "0.4.0"
+version = "0.7.4"
+license = "MIT"
 description = "OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions."
 readme = "README.md"
 authors = [
@@ -19,6 +20,10 @@ dependencies = [
     "urlextract>=1.9.0",
 ]

+[project.urls]
+Repository = "https://github.com/bouli/ohmyscrapper"
+Changelog = "https://github.com/bouli/ohmyscrapper/releases/latest"
+
 [project.scripts]
 ohmyscrapper = "ohmyscrapper:main"

@@ -29,11 +34,13 @@ build-backend = "uv_build"
 [tool.bumpversion]
 tag = true
 tag_name = "v{new_version}"
-pre_commit_hooks = ["uv sync --upgrade", "git add uv.lock"]
+pre_commit_hooks = ["uvx black ./src", "git add src", "git commit -m 'chore: Beautify with black'", "uv sync --upgrade", "git add uv.lock"]
 commit = true

 [[tool.bumpversion.files]]
 filename = "pyproject.toml"
+search = 'version = "{current_version}"'
+replace = 'version = "{new_version}"'

 [[tool.bumpversion.files]]
 filename = "README.md"

{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/__init__.py

@@ -3,7 +3,7 @@ import argparse
 from ohmyscrapper.modules.classify_urls import classify_urls
 from ohmyscrapper.modules.sniff_url import sniff_url
 from ohmyscrapper.modules.load_txt import load_txt
-from ohmyscrapper.modules.seed import seed
+from ohmyscrapper.modules.seed import seed, export_url_types_to_file
 from ohmyscrapper.modules.scrap_urls import scrap_urls
 from ohmyscrapper.modules.show import (
     show_url,
@@ -15,17 +15,22 @@ from ohmyscrapper.modules.show import (
 from ohmyscrapper.modules.untouch_all import untouch_all
 from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
 from ohmyscrapper.modules.merge_dbs import merge_dbs
+from ohmyscrapper.core.config import update


 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.4.0")
+    parser.add_argument("--version", action="version", version="%(prog)s v0.7.4")

+    update()
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
     start_parser = subparsers.add_parser(
         "start",
         help="Make the entire process of 📦 loading, 🐶 scraping and 📜🖋️ exporting with the default configuration.",
     )
+    start_parser.add_argument(
+        "-input", default=None, help="File/Folder path or url for pre-loading."
+    )

     start_parser.add_argument(
         "--ai",
@@ -40,8 +45,22 @@ def main():
     )

     seed_parser = subparsers.add_parser(
-        "seed", help="Seed database
+        "seed", help="Seed database with `url_types` to classify the `urls`."
+    )
+    seed_parser.add_argument(
+        "--export",
+        default=False,
+        help="Add all `url_types` from the bank to the `/ohmyscrapper/url_types.yaml` file.",
+        action="store_true",
     )
+
+    seed_parser.add_argument(
+        "--reset",
+        default=False,
+        help="Reset all `url_types`.",
+        action="store_true",
+    )
+
     untouch_parser = subparsers.add_parser(
         "untouch-all", help="Untouch all urls. That resets classification"
     )
@@ -54,7 +73,9 @@ def main():
     )

     load_txt_parser = subparsers.add_parser("load", help="📦 Load txt file")
-    load_txt_parser.add_argument(
+    load_txt_parser.add_argument(
+        "-input", default=None, help="File/Folder path or url."
+    )
     load_txt_parser.add_argument(
         "--verbose", default=False, help="Run in verbose mode", action="store_true"
     )
@@ -75,6 +96,9 @@ def main():
     scrap_urls_parser.add_argument(
         "--verbose", default=False, help="Run in verbose mode", action="store_true"
     )
+    scrap_urls_parser.add_argument(
+        "-input", default=None, help="File/Folder path or url for pre-loading."
+    )

     sniff_url_parser = subparsers.add_parser("sniff-url", help="🐕 Sniff/Check url")
     sniff_url_parser.add_argument(
@@ -118,7 +142,10 @@ def main():
         return

     if args.command == "seed":
-
+        if args.export:
+            export_url_types_to_file()
+        else:
+            seed(args.reset)
         return

     if args.command == "untouch-all":
@@ -130,6 +157,9 @@ def main():
         return

     if args.command == "scrap-urls":
+        if args.input != None:
+            load_txt(file_name=args.input, verbose=args.verbose)
+
         scrap_urls(
             recursive=args.recursive,
             ignore_valid_prefix=args.ignore_type,
@@ -169,7 +199,12 @@ def main():
         return

     if args.command == "start":
-
+        seed()
+        if args.input != None:
+            load_txt(file_name=args.input)
+        else:
+            load_txt()
+
         scrap_urls(
             recursive=True,
             ignore_valid_prefix=True,

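In practice, the new `-input` option and the reworked `start` handler simply chain the module functions the CLI already imports. Below is a rough, minimal sketch of what `ohmyscrapper start -input ./input/links.txt` now runs, assuming the package is installed; the input path is illustrative:

    # Rough equivalent of `ohmyscrapper start -input ./input/links.txt`
    from ohmyscrapper.core.config import update
    from ohmyscrapper.modules.seed import seed
    from ohmyscrapper.modules.load_txt import load_txt
    from ohmyscrapper.modules.scrap_urls import scrap_urls

    update()                                  # offer to migrate a legacy ./customize folder
    seed()                                    # make sure url_types are seeded
    load_txt(file_name="./input/links.txt")   # pre-load the given file/folder/url
    scrap_urls(recursive=True, ignore_valid_prefix=True)
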
ohmyscrapper-0.7.4/src/ohmyscrapper/core/config.py

@@ -0,0 +1,113 @@
+import os
+from ohmyscrapper.core import config_files
+
+default_app_dir = "ohmyscrapper"
+
+
+def get_dir(param="ohmyscrapper"):
+    parent_param = "default_dirs"
+
+    if param == default_app_dir:
+        folder = "./" + param
+    else:
+        folder = config_files.get_param(
+            parent_param=parent_param, param=param, default_app_dir=default_app_dir
+        )
+    if not os.path.exists(folder):
+        os.mkdir(folder)
+    return folder
+
+
+def get_files(param):
+    parent_param = "default_files"
+    return config_files.get_param(
+        parent_param=parent_param, param=param, default_app_dir=default_app_dir
+    )
+
+
+def get_db(param="db_file"):
+    if param == "folder":
+        return get_dir(param="db")
+    return config_files.get_param(
+        parent_param="db", param=param, default_app_dir=default_app_dir
+    )
+
+
+def get_ai(param):
+    return config_files.get_param(
+        parent_param="ai", param=param, default_app_dir=default_app_dir
+    )
+
+
+def get_sniffing(param):
+    return config_files.get_param(
+        parent_param="sniffing", param=param, default_app_dir=default_app_dir
+    )
+
+
+def load_config(force_default=False):
+    config_file_name = "config.yaml"
+    config_params = config_files.create_and_read_config_file(
+        file_name=config_file_name,
+        default_app_dir=default_app_dir,
+        force_default=force_default,
+    )
+
+    if config_params is None or "default_dirs" not in config_params:
+        config_params = load_config(force_default=True)
+
+    return config_params
+
+
+def url_types_file_exists():
+    url_types_file = get_files("url_types")
+    return config_files.config_file_exists(
+        url_types_file, default_app_dir=default_app_dir
+    )
+
+
+def get_url_types():
+    url_types_file = get_files("url_types")
+    return config_files.create_and_read_config_file(
+        url_types_file, default_app_dir=default_app_dir
+    )
+
+
+def get_url_sniffing():
+    file = get_files("url_sniffing")
+    return config_files.create_and_read_config_file(
+        file, default_app_dir=default_app_dir
+    )
+
+
+def append_url_sniffing(data):
+    file = get_files("url_sniffing")
+    _append_config_file(data, file)
+
+
+def append_url_types(url_types):
+    url_types_file = get_files("url_types")
+    _append_config_file(url_types, url_types_file)
+
+
+def overwrite_config_file(data, file_name):
+    config_files.overwrite_config_file(data, file_name, default_app_dir=default_app_dir)
+
+
+def _append_config_file(data, file_name):
+    config_files.append_config_file(data, file_name, default_app_dir=default_app_dir)
+
+
+def update():
+    legacy_folder = "./customize"
+    new_folder = "./ohmyscrapper"
+    if os.path.exists(legacy_folder) and not os.path.exists(new_folder):
+        yes_no = input(
+            "We detected a legacy folder system for your OhMyScrapper, would you like to update? \n"
+            "If you don't update, a new version will be used and your legacy folder will be ignored. \n"
+            "[Y] for yes or any other thing to ignore: "
+        )
+        if yes_no == "Y":
+            os.rename(legacy_folder, new_folder)
+            print(" You are up-to-date! =)")
+            print("")

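The new `core/config.py` is a thin façade over `config_files`: directory, database and sniffing settings all resolve through `ohmyscrapper/config.yaml`, and directories are created on first access. A minimal usage sketch, assuming the shipped defaults shown in `config.yaml` further down:

    from ohmyscrapper.core import config

    output_dir = config.get_dir("output")     # "./output" with the defaults; created if missing
    db_file = config.get_db()                 # "local.db"
    timeout = config.get_sniffing("timeout")  # 10
    rules = config.get_url_sniffing()         # dict parsed from ohmyscrapper/url_sniffing.yaml
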
ohmyscrapper-0.7.4/src/ohmyscrapper/core/config_files.py

@@ -0,0 +1,100 @@
+import os
+import yaml
+
+
+def create_and_read_config_file(file_name, default_app_dir, force_default=False):
+    config_file = config_file_path(file_name, default_app_dir)
+    default_config_params = _get_default_file(default_file=file_name)
+    if force_default or not os.path.exists(config_file):
+        overwrite_config_file(
+            data=default_config_params,
+            file_name=file_name,
+            default_app_dir=default_app_dir,
+        )
+        config_params = default_config_params
+    else:
+        with open(config_file, "r") as f:
+            config_params = yaml.safe_load(f.read())
+        if complete_config_file(
+            config_params=config_params,
+            default_config_params=default_config_params,
+            file_name=file_name,
+            default_app_dir=default_app_dir,
+        ):
+            config_params = create_and_read_config_file(
+                file_name=file_name,
+                default_app_dir=default_app_dir,
+                force_default=force_default,
+            )
+
+    if config_params is None:
+        config_params = create_and_read_config_file(
+            file_name=file_name, default_app_dir=default_app_dir, force_default=True
+        )
+    return config_params
+
+
+def complete_config_file(
+    config_params, default_config_params, file_name, default_app_dir
+):
+    has_updated = False
+    for key, values in default_config_params.items():
+        if key not in config_params.keys():
+            has_updated = True
+            data = {key: values}
+            append_config_file(data, file_name, default_app_dir)
+    return has_updated
+
+
+def overwrite_config_file(data, file_name, default_app_dir):
+    config_file = config_file_path(file_name, default_app_dir)
+    with open(config_file, "+w") as f:
+        f.write(yaml.safe_dump(data))
+
+
+def append_config_file(data, file_name, default_app_dir):
+    config_file = config_file_path(file_name, default_app_dir)
+    # append
+    with open(config_file, "+a") as f:
+        yaml.dump(data, f, allow_unicode=True)
+    # read
+    with open(config_file, "r") as f:
+        data = yaml.safe_load(f.read())
+    # overwrite preventing repetition
+    with open(config_file, "w") as f:
+        yaml.dump(data, f, allow_unicode=True)
+
+
+def get_param(parent_param, param, default_app_dir):
+    default_dirs = create_and_read_config_file(
+        file_name="config.yaml", default_app_dir=default_app_dir
+    )[parent_param]
+
+    if param in default_dirs:
+        return default_dirs[param]
+    else:
+        raise Exception(f"{param} do not exist in your params {parent_param}.")
+
+
+def config_file_exists(file_name, default_app_dir):
+    return os.path.exists(config_file_path(file_name, default_app_dir))
+
+
+def config_file_path(file_name, default_app_dir):
+    _ensure_default_app_dir(default_app_dir)
+    config_file = os.path.join(default_app_dir, file_name)
+    return config_file
+
+
+def _ensure_default_app_dir(default_app_dir):
+    if not os.path.exists(default_app_dir):
+        os.mkdir(default_app_dir)
+
+
+def _get_default_file(default_file):
+    default_files_dir = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)), "default_files"
+    )
+    default_file = os.path.join(default_files_dir, default_file)
+    with open(default_file, "r") as f:
+        return yaml.safe_load(f.read())

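`config_files.create_and_read_config_file` writes the packaged default on first run and, on later runs, appends any missing top-level keys to the user's copy via `complete_config_file`. A small sketch of calling it directly, assuming an `ohmyscrapper/` working directory in the current folder is acceptable:

    from ohmyscrapper.core import config_files

    # First call creates ./ohmyscrapper/config.yaml from the packaged default;
    # later calls read it back and top up missing keys.
    params = config_files.create_and_read_config_file(
        file_name="config.yaml", default_app_dir="ohmyscrapper"
    )
    print(params["sniffing"]["timeout"])  # 10 with the shipped defaults
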
ohmyscrapper-0.7.4/src/ohmyscrapper/core/default_files/config.yaml

@@ -0,0 +1,19 @@
+db:
+  db_file: local.db
+
+default_dirs:
+  db: ./db
+  input: ./input
+  output: ./output
+  prompts: ./prompts
+  templates: ./templates
+
+default_files:
+  url_types: url_types.yaml
+  url_sniffing: url_sniffing.yaml
+
+ai:
+  default_prompt_file: prompt.md
+
+sniffing:
+  timeout: 10

ohmyscrapper-0.7.4/src/ohmyscrapper/core/default_files/url_sniffing.yaml

@@ -0,0 +1,29 @@
+linkedin_feed:
+  metatags:
+    og:url: url_destiny
+
+linkedin_job:
+  bodytags:
+    h1: title
+  metatags:
+    og:title: title
+    og:description: description
+    description: description
+
+linkedin_post:
+  bodytags:
+    h1: title
+  metatags:
+    og:title: title
+    og:description: description
+    description: description
+
+linkedin_redirect:
+  metatags:
+    og:url: url_destiny
+  atags:
+    first-tag-as-url_destiny: 5
+
+read_all_a_tags:
+  atags:
+    load_atags: True

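These sniffing rules appear to map HTML meta, body and anchor tags onto url fields such as `title`, `description` and `url_destiny`. The snippet below is only a hypothetical illustration of applying one rule's `metatags` mapping with BeautifulSoup (a declared dependency); the real logic lives in `modules/sniff_url.py`, which is not shown in this section:

    # Hypothetical illustration only; modules/sniff_url.py holds the real logic.
    from bs4 import BeautifulSoup

    rule = {"og:title": "title", "og:description": "description", "description": "description"}
    html = '<html><head><meta property="og:title" content="Data Engineer"/></head></html>'
    soup = BeautifulSoup(html, "html.parser")

    fields = {}
    for meta_name, field in rule.items():
        tag = soup.find("meta", attrs={"property": meta_name}) or soup.find(
            "meta", attrs={"name": meta_name}
        )
        if tag and tag.get("content"):
            fields[field] = tag["content"]

    print(fields)  # {'title': 'Data Engineer'}
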
{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/models/urls_manager.py

@@ -4,16 +4,19 @@ import time
 import glob
 import pandas as pd
 from urllib.parse import urlparse, urlunparse
+from ohmyscrapper.core import config


 def get_db_dir():
-
-
-
+    db_folder = config.get_dir("db")
+    if not os.path.exists(db_folder):
+        os.mkdir(db_folder)
+    return db_folder


 def get_db_path():
-
+    db_file = config.get_db()
+    return os.path.join(get_db_dir(), db_file)


 def get_db_connection():
@@ -26,7 +29,11 @@ def use_connection(func):
     def provide_connection(*args, **kwargs):
         global conn
         with get_db_connection() as conn:
-
+            try:
+                return func(*args, **kwargs)
+            except:
+                update_db()
+                return func(*args, **kwargs)

     return provide_connection

@@ -35,7 +42,7 @@ def create_tables(conn):

     c = conn.cursor()
     c.execute(
-        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT,
+        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, title TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
     )
     c.execute(
         "CREATE TABLE IF NOT EXISTS ai_log (id INTEGER PRIMARY KEY, instructions STRING, response STRING, model STRING, prompt_file STRING, prompt_name STRING, created_at DATETIME)"
@@ -46,16 +53,30 @@ def create_tables(conn):
     )


-def 
-
-
-
-
-
+def update_db():
+    try:
+        c = conn.cursor()
+        c.execute("ALTER TABLE urls RENAME COLUMN h1 TO title")
+    except:
+        pass
+
+
+def seeds(seeds={}):
+
+    for url_type, url_prefix in seeds.items():
+        add_urls_valid_prefix(url_prefix, url_type)

     return True


+@use_connection
+def reset_seeds():
+    sql = "DELETE FROM urls_valid_prefix"
+    c = conn.cursor()
+    c.execute(sql)
+    conn.commit()
+
+
 @use_connection
 def add_urls_valid_prefix(url_prefix, url_type):

@@ -117,7 +138,7 @@ def get_urls_report():
         SELECT
             u.id,
             u.url,
-            u.
+            u.title
         FROM urls u
         INNER JOIN parent_url p
         ON u.url = p.parent_url
@@ -126,9 +147,9 @@ def get_urls_report():
             u.id,
             u.url_type,
             u.url,
-            COALESCE(u.
+            COALESCE(u.title, p.title) as title,
             p.url as parent_url,
-            p.
+            p.title as parent_title
         FROM urls u
         LEFT JOIN parents p
         ON u.parent_url = p.url
@@ -184,12 +205,14 @@ def get_url_like_unclassified(like_condition):


 @use_connection
-def add_url(url, h1=None, parent_url=None):
+def add_url(url, title=None, parent_url=None):
+    if url[:1] == "/":
+        return
     url = clean_url(url)
     c = conn.cursor()

-    if 
-
+    if title is not None:
+        title = title.strip()

     if parent_url is None:
         parent_url = None
@@ -198,8 +221,8 @@ def add_url(url, h1=None, parent_url=None):

     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url, 
-            (url, 
+            "INSERT INTO urls (url, title, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
+            (url, title, parent_url, int(time.time())),
         )
         conn.commit()

@@ -238,20 +261,20 @@ def set_url_destiny(url, destiny):


 @use_connection
-def 
+def set_url_title(url, value):
     value = str(value).strip()
     url = clean_url(url)
     c = conn.cursor()
-    c.execute("UPDATE urls SET 
+    c.execute("UPDATE urls SET title = ? WHERE url = ?", (value, url))
     conn.commit()


 @use_connection
-def 
+def set_url_title_by_id(id, value):
     value = str(value).strip()

     c = conn.cursor()
-    c.execute("UPDATE urls SET 
+    c.execute("UPDATE urls SET title = ? WHERE id = ?", (value, id))
     conn.commit()


@@ -327,7 +350,9 @@ def set_url_error(url, value):
 @use_connection
 def set_url_type_by_id(url_id, url_type):
     c = conn.cursor()
-    c.execute(
+    c.execute(
+        f"UPDATE urls SET url_type = '{url_type}', last_touch = NULL WHERE id = {url_id}"
+    )
     conn.commit()


@@ -379,8 +404,10 @@ def touch_url(url):
 @use_connection
 def untouch_url(url):
     url = clean_url(url)
+    url = str(url.strip())
+
     c = conn.cursor()
-    c.execute("UPDATE urls SET last_touch = NULL WHERE url = 
+    c.execute(f"UPDATE urls SET last_touch = NULL, url_type = NULL WHERE url = '{url}'")
     conn.commit()


@@ -426,16 +453,16 @@ def merge_dbs() -> None:


 @use_connection
-def merge_url(url, 
+def merge_url(url, title, last_touch, created_at, description, json):
     url = clean_url(url)
     c = conn.cursor()

-    if 
-
+    if title is not None:
+        title = title.strip()

     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url, 
-            (url, 
+            "INSERT INTO urls (url, title, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
+            (url, title, last_touch, created_at, description, json),
         )
         conn.commit()

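`use_connection` now retries a failed call once after running `update_db()`, which renames the legacy `h1` column to `title`. Below is a self-contained sketch of that retry-after-migration pattern using plain `sqlite3` (table and column names are illustrative, it assumes SQLite 3.25+ for RENAME COLUMN, and the real decorator catches every exception rather than only `OperationalError`):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE urls (id INTEGER PRIMARY KEY, url TEXT UNIQUE, h1 TEXT)")

    def update_db():
        # Lazy schema migration, mirroring urls_manager.update_db() above.
        conn.execute("ALTER TABLE urls RENAME COLUMN h1 TO title")

    def use_connection(func):
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except sqlite3.OperationalError:
                update_db()                 # e.g. "no column named title" -> rename h1, retry once
                return func(*args, **kwargs)
        return wrapper

    @use_connection
    def add_url(url, title):
        conn.execute("INSERT INTO urls (url, title) VALUES (?, ?)", (url, title))
        conn.commit()

    add_url("https://example.com/jobs/1", "Data Engineer")  # fails once, migrates, then succeeds
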
{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.4}/src/ohmyscrapper/modules/classify_urls.py

@@ -1,11 +1,15 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.modules import seed
 import pandas as pd
 import time


 def classify_urls(recursive=False):
-    urls_manager.seeds()
     df = urls_manager.get_urls_valid_prefix()
+    if len(df) == 0:
+        seed.seed()
+        classify_urls(recursive=recursive)
+        return

     keep_alive = True
     while keep_alive: