ohmyscrapper 0.2.1__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ohmyscrapper/__init__.py +57 -18
- ohmyscrapper/core/config.py +95 -0
- ohmyscrapper/core/config_files.py +73 -0
- ohmyscrapper/core/default_files/config.yaml +15 -0
- ohmyscrapper/core/default_files/url_types.yaml +5 -0
- ohmyscrapper/models/urls_manager.py +67 -25
- ohmyscrapper/modules/classify_urls.py +9 -5
- ohmyscrapper/modules/load_txt.py +79 -11
- ohmyscrapper/modules/process_with_ai.py +72 -36
- ohmyscrapper/modules/scrap_urls.py +80 -49
- ohmyscrapper/modules/seed.py +28 -2
- ohmyscrapper/modules/show.py +22 -14
- ohmyscrapper/modules/sniff_url.py +82 -38
- ohmyscrapper/modules/untouch_all.py +1 -1
- {ohmyscrapper-0.2.1.dist-info → ohmyscrapper-0.6.1.dist-info}/METADATA +53 -27
- ohmyscrapper-0.6.1.dist-info/RECORD +20 -0
- ohmyscrapper-0.2.1.dist-info/RECORD +0 -16
- {ohmyscrapper-0.2.1.dist-info → ohmyscrapper-0.6.1.dist-info}/WHEEL +0 -0
- {ohmyscrapper-0.2.1.dist-info → ohmyscrapper-0.6.1.dist-info}/entry_points.txt +0 -0
ohmyscrapper/__init__.py
CHANGED
@@ -3,7 +3,7 @@ import argparse
 from ohmyscrapper.modules.classify_urls import classify_urls
 from ohmyscrapper.modules.sniff_url import sniff_url
 from ohmyscrapper.modules.load_txt import load_txt
-from ohmyscrapper.modules.seed import seed
+from ohmyscrapper.modules.seed import seed, export_url_types_to_file
 from ohmyscrapper.modules.scrap_urls import scrap_urls
 from ohmyscrapper.modules.show import (
     show_url,
@@ -15,23 +15,40 @@ from ohmyscrapper.modules.show import (
 from ohmyscrapper.modules.untouch_all import untouch_all
 from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
 from ohmyscrapper.modules.merge_dbs import merge_dbs
+from ohmyscrapper.core.config import update
 
 
 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.
+    parser.add_argument("--version", action="version", version="%(prog)s v0.6.1")
 
+    update()
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
+    start_parser = subparsers.add_parser(
+        "start",
+        help="Make the entire process of 📦 loading, 🐶 scraping and 📜🖋️ exporting with the default configuration.",
+    )
 
-
-    "
+    start_parser.add_argument(
+        "--ai",
+        default=False,
+        help="Make the entire process of loading, processing, reprocessing with AI and exporting with the default configuration.",
+        action="store_true",
     )
+
+    ai_process_parser = subparsers.add_parser("ai", help="Process with AI.")
     ai_process_parser.add_argument(
         "--history", default=False, help="Reprocess ai history", action="store_true"
     )
 
     seed_parser = subparsers.add_parser(
-        "seed", help="Seed database
+        "seed", help="Seed database with `url_types` to classify the `urls`."
+    )
+    seed_parser.add_argument(
+        "--export",
+        default=False,
+        help="Add all `url_types` from the bank to the `/ohmyscrapper/url_types.yaml` file.",
+        action="store_true",
     )
     untouch_parser = subparsers.add_parser(
         "untouch-all", help="Untouch all urls. That resets classification"
@@ -44,12 +61,15 @@ def main():
         "--recursive", default=False, help="Run in recursive mode", action="store_true"
     )
 
-    load_txt_parser = subparsers.add_parser("load", help="Load txt file")
+    load_txt_parser = subparsers.add_parser("load", help="📦 Load txt file")
     load_txt_parser.add_argument(
-        "-
+        "-input", default=None, help="File/Folder path or url."
+    )
+    load_txt_parser.add_argument(
+        "--verbose", default=False, help="Run in verbose mode", action="store_true"
    )
 
-    scrap_urls_parser = subparsers.add_parser("scrap-urls", help="Scrap urls")
+    scrap_urls_parser = subparsers.add_parser("scrap-urls", help="🐶 Scrap urls")
     scrap_urls_parser.add_argument(
         "--recursive", default=False, help="Run in recursive mode", action="store_true"
     )
@@ -62,8 +82,11 @@ def main():
     scrap_urls_parser.add_argument(
         "--only-parents", default=False, help="Only parents urls", action="store_true"
     )
+    scrap_urls_parser.add_argument(
+        "--verbose", default=False, help="Run in verbose mode", action="store_true"
+    )
 
-    sniff_url_parser = subparsers.add_parser("sniff-url", help="Check url")
+    sniff_url_parser = subparsers.add_parser("sniff-url", help="🐕 Sniff/Check url")
     sniff_url_parser.add_argument(
         "url", default="https://cesarcardoso.cc/", help="Url to sniff"
     )
@@ -75,7 +98,7 @@ def main():
     show_urls_parser.add_argument("--limit", default=0, help="Limit of lines to show")
     show_urls_parser.add_argument("-url", default="", help="Url to show")
 
-    export_parser = subparsers.add_parser("export", help="Export urls to csv.")
+    export_parser = subparsers.add_parser("export", help="📊🖋️ Export urls to csv.")
     export_parser.add_argument("--limit", default=0, help="Limit of lines to export")
     export_parser.add_argument(
         "--file",
@@ -89,14 +112,11 @@ def main():
         action="store_true",
     )
 
-    report_parser = subparsers.add_parser(
+    report_parser = subparsers.add_parser(
+        "report", help="📜🖋️ Export urls report to csv."
+    )
     merge_parser = subparsers.add_parser("merge_dbs", help="Merge databases.")
 
-    # TODO: What is that?
-    # seed_parser.set_defaults(func=seed)
-    # classify_urls_parser.set_defaults(func=classify_urls)
-    # load_txt_parser.set_defaults(func=load_txt)
-
     args = parser.parse_args()
 
     if args.command == "classify-urls":
@@ -104,11 +124,14 @@ def main():
         return
 
     if args.command == "load":
-        load_txt(args.
+        load_txt(file_name=args.input, verbose=args.verbose)
         return
 
     if args.command == "seed":
-
+        if args.export:
+            export_url_types_to_file()
+        else:
+            seed()
         return
 
     if args.command == "untouch-all":
@@ -125,6 +148,7 @@ def main():
             ignore_valid_prefix=args.ignore_type,
             randomize=args.randomize,
             only_parents=args.only_parents,
+            verbose=args.verbose,
         )
         return
 
@@ -157,6 +181,21 @@ def main():
         merge_dbs()
         return
 
+    if args.command == "start":
+        load_txt()
+        scrap_urls(
+            recursive=True,
+            ignore_valid_prefix=True,
+            randomize=False,
+            only_parents=False,
+        )
+        if args.ai:
+            process_with_ai()
+        export_urls()
+        export_urls(csv_file="output/urls-simplified.csv", simplify=True)
+        export_report()
+        return
+
 
 if __name__ == "__main__":
     main()
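For orientation, the new `start` subcommand chains the existing pipeline steps (load, scrap, optional AI processing, CSV exports) in one call. A usage sketch, assuming the console script from entry_points.txt is still named `ohmyscrapper` and using a placeholder input file name:

    ohmyscrapper start              # load + scrap + export with defaults
    ohmyscrapper start --ai         # same, plus AI processing before the exports
    ohmyscrapper seed --export      # dump the url_types bank to ohmyscrapper/url_types.yaml
    ohmyscrapper load -input links.txt --verbose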
ohmyscrapper/core/config.py
ADDED
@@ -0,0 +1,95 @@
+import os
+from ohmyscrapper.core import config_files
+
+default_app_dir = "ohmyscrapper"
+
+
+def get_dir(param="ohmyscrapper"):
+    parent_param = "default_dirs"
+
+    if param == default_app_dir:
+        folder = "./" + param
+    else:
+        folder = config_files.get_param(
+            parent_param=parent_param, param=param, default_app_dir=default_app_dir
+        )
+    if not os.path.exists(folder):
+        os.mkdir(folder)
+    return folder
+
+
+def get_files(param):
+    parent_param = "default_files"
+    return config_files.get_param(
+        parent_param=parent_param, param=param, default_app_dir=default_app_dir
+    )
+
+
+def get_db(param="db_file"):
+    if param == "folder":
+        return get_dir(param="db")
+    return config_files.get_param(
+        parent_param="db", param=param, default_app_dir=default_app_dir
+    )
+
+
+def get_ai(param):
+    return config_files.get_param(
+        parent_param="ai", param=param, default_app_dir=default_app_dir
+    )
+
+
+def load_config(force_default=False):
+    config_file_name = "config.yaml"
+    config_params = config_files.create_and_read_config_file(
+        file_name=config_file_name,
+        default_app_dir=default_app_dir,
+        force_default=force_default,
+    )
+
+    if config_params is None or "default_dirs" not in config_params:
+        config_params = load_config(force_default=True)
+
+    return config_params
+
+
+def url_types_file_exists():
+    url_types_file = get_files("url_types")
+    return config_files.config_file_exists(
+        url_types_file, default_app_dir=default_app_dir
+    )
+
+
+def get_url_types():
+    url_types_file = get_files("url_types")
+    return config_files.create_and_read_config_file(
+        url_types_file, default_app_dir=default_app_dir
+    )
+
+
+def append_url_types(url_types):
+    url_types_file = get_files("url_types")
+    _append_config_file(url_types, url_types_file)
+
+
+def overwrite_config_file(data, file_name):
+    config_files.overwrite_config_file(data, file_name, default_app_dir=default_app_dir)
+
+
+def _append_config_file(data, file_name):
+    config_files.append_config_file(data, file_name, default_app_dir=default_app_dir)
+
+
+def update():
+    legacy_folder = "./customize"
+    new_folder = "./ohmyscrapper"
+    if os.path.exists(legacy_folder) and not os.path.exists(new_folder):
+        yes_no = input(
+            "We detected a legacy folder system for your OhMyScrapper, would you like to update? \n"
+            "If you don't update, a new version will be used and your legacy folder will be ignored. \n"
+            "[Y] for yes or any other thing to ignore: "
+        )
+        if yes_no == "Y":
+            os.rename(legacy_folder, new_folder)
+            print(" You are up-to-date! =)")
+    print("")
ohmyscrapper/core/config_files.py
ADDED
@@ -0,0 +1,73 @@
+import os
+import yaml
+
+
+def create_and_read_config_file(file_name, default_app_dir, force_default=False):
+    config_file = config_file_path(file_name, default_app_dir)
+    if force_default or not os.path.exists(config_file):
+        config_params = _get_default_file(default_file=file_name)
+        overwrite_config_file(
+            data=config_params, file_name=file_name, default_app_dir=default_app_dir
+        )
+    else:
+        with open(config_file, "r") as f:
+            config_params = yaml.safe_load(f.read())
+        if config_params is None:
+            config_params = create_and_read_config_file(
+                file_name=file_name, default_app_dir=default_app_dir, force_default=True
+            )
+    return config_params
+
+
+def overwrite_config_file(data, file_name, default_app_dir):
+    config_file = config_file_path(file_name, default_app_dir)
+    with open(config_file, "+w") as f:
+        f.write(yaml.safe_dump(data))
+
+
+def append_config_file(data, file_name, default_app_dir):
+    config_file = config_file_path(file_name, default_app_dir)
+    # append
+    with open(config_file, "+a") as f:
+        yaml.dump(data, f, allow_unicode=True)
+    # read
+    with open(config_file, "r") as f:
+        data = yaml.safe_load(f.read())
+    # overwrite preventing repetition
+    with open(config_file, "w") as f:
+        yaml.dump(data, f, allow_unicode=True)
+
+
+def get_param(parent_param, param, default_app_dir):
+    default_dirs = create_and_read_config_file(
+        file_name="config.yaml", default_app_dir=default_app_dir
+    )[parent_param]
+
+    if param in default_dirs:
+        return default_dirs[param]
+    else:
+        raise Exception(f"{param} do not exist in your params {parent_param}.")
+
+
+def config_file_exists(file_name, default_app_dir):
+    return os.path.exists(config_file_path(file_name, default_app_dir))
+
+
+def config_file_path(file_name, default_app_dir):
+    _ensure_default_app_dir(default_app_dir)
+    config_file = os.path.join(default_app_dir, file_name)
+    return config_file
+
+
+def _ensure_default_app_dir(default_app_dir):
+    if not os.path.exists(default_app_dir):
+        os.mkdir(default_app_dir)
+
+
+def _get_default_file(default_file):
+    default_files_dir = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)), "default_files"
+    )
+    default_file = os.path.join(default_files_dir, default_file)
+    with open(default_file, "r") as f:
+        return yaml.safe_load(f.read())
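Taken together, `config.py` and `config_files.py` lazily materialize a per-project `./ohmyscrapper/` folder from the bundled defaults and resolve settings from it. A minimal usage sketch (illustrative only; the key lookups mirror the accessors above, and the actual values depend on the shipped default config.yaml):

    from ohmyscrapper.core import config

    params = config.load_config()       # creates ./ohmyscrapper/config.yaml from the bundled default on first run
    db_dir = config.get_dir("db")       # looks up config["default_dirs"]["db"] and creates the folder if missing
    db_file = config.get_db()           # looks up config["db"]["db_file"]
    url_types = config.get_url_types()  # creates/reads the YAML file named by config["default_files"]["url_types"]
    print(params, db_dir, db_file, url_types)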
ohmyscrapper/models/urls_manager.py
CHANGED
@@ -4,27 +4,37 @@ import time
 import glob
 import pandas as pd
 from urllib.parse import urlparse, urlunparse
+from ohmyscrapper.core import config
 
 
 def get_db_dir():
-
-
-
+    db_folder = config.get_dir("db")
+    if not os.path.exists(db_folder):
+        os.mkdir(db_folder)
+    return db_folder
 
 
 def get_db_path():
-
+    db_file = config.get_db()
+    return os.path.join(get_db_dir(), db_file)
 
 
 def get_db_connection():
+    if not os.path.exists(get_db_path()):
+        create_tables(sqlite3.connect(get_db_path()))
     return sqlite3.connect(get_db_path())
 
 
-
-
+def use_connection(func):
+    def provide_connection(*args, **kwargs):
+        global conn
+        with get_db_connection() as conn:
+            return func(*args, **kwargs)
 
+    return provide_connection
 
-
+
+def create_tables(conn):
 
     c = conn.cursor()
     c.execute(
@@ -38,27 +48,17 @@ def create_tables():
         "CREATE TABLE IF NOT EXISTS urls_valid_prefix (id INTEGER PRIMARY KEY, url_prefix TEXT UNIQUE, url_type TEXT)"
     )
 
-    return pd.read_sql_query("SELECT * FROM urls LIMIT 100", conn)
-
 
-
-def seeds():
-    create_tables()
+def seeds(seeds={}):
 
-
-
-    add_urls_valid_prefix("https://%.linkedin.com/jobs/view/%", "linkedin_job")
-    add_urls_valid_prefix("https://%.linkedin.com/feed/%", "linkedin_feed")
-    add_urls_valid_prefix("https://%.linkedin.com/company/%", "linkedin_company")
-
-    # add_urls_valid_prefix("%.pdf", "pdf")
-    # add_url('https://imazon.org.br/categorias/artigos-cientificos/')
+    for url_type, url_prefix in seeds.items():
+        add_urls_valid_prefix(url_prefix, url_type)
 
     return True
 
 
+@use_connection
 def add_urls_valid_prefix(url_prefix, url_type):
-    conn = get_db_connection()
 
     df = pd.read_sql_query(
         f"SELECT * FROM urls_valid_prefix WHERE url_prefix = '{url_prefix}'", conn
@@ -72,6 +72,7 @@ def add_urls_valid_prefix(url_prefix, url_type):
     conn.commit()
 
 
+@use_connection
 def get_urls_valid_prefix_by_type(url_type):
     df = pd.read_sql_query(
         f"SELECT * FROM urls_valid_prefix WHERE url_type = '{url_type}'", conn
@@ -79,12 +80,14 @@ def get_urls_valid_prefix_by_type(url_type):
     return df
 
 
+@use_connection
 def get_urls_valid_prefix_by_id(id):
     df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix WHERE id = '{id}'", conn)
     return df
 
 
 # TODO: pagination required
+@use_connection
 def get_urls_valid_prefix(limit=0):
     if limit > 0:
         df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix LIMIT {limit}", conn)
@@ -94,6 +97,7 @@ def get_urls_valid_prefix(limit=0):
 
 
 # TODO: pagination required
+@use_connection
 def get_urls(limit=0):
     if limit > 0:
         df = pd.read_sql_query(
@@ -104,6 +108,7 @@ def get_urls(limit=0):
     return df
 
 
+@use_connection
 def get_urls_report():
     sql = """
     WITH parent_url AS (
@@ -138,6 +143,7 @@ def get_urls_report():
     return df
 
 
+@use_connection
 def get_url_by_url(url):
     url = clean_url(url)
     df = pd.read_sql_query(f"SELECT * FROM urls WHERE url = '{url}'", conn)
@@ -145,12 +151,14 @@ def get_url_by_url(url):
     return df
 
 
+@use_connection
 def get_url_by_id(id):
     df = pd.read_sql_query(f"SELECT * FROM urls WHERE id = '{id}'", conn)
 
     return df
 
 
+@use_connection
 def get_urls_by_url_type(url_type):
     df = pd.read_sql_query(
         f"SELECT * FROM urls WHERE history = 0 AND url_type = '{url_type}'", conn
@@ -158,6 +166,7 @@ def get_urls_by_url_type(url_type):
     return df
 
 
+@use_connection
 def get_urls_by_url_type_for_ai_process(url_type="linkedin_post", limit=10):
     df = pd.read_sql_query(
         f"SELECT * FROM urls WHERE history = 0 AND url_type = '{url_type}' AND ai_processed = 0 LIMIT {limit}",
@@ -166,6 +175,7 @@ def get_urls_by_url_type_for_ai_process(url_type="linkedin_post", limit=10):
     return df
 
 
+@use_connection
 def get_url_like_unclassified(like_condition):
     df = pd.read_sql_query(
         f"SELECT * FROM urls WHERE history = 0 AND url LIKE '{like_condition}' AND url_type IS NULL",
@@ -174,6 +184,7 @@ def get_url_like_unclassified(like_condition):
     return df
 
 
+@use_connection
 def add_url(url, h1=None, parent_url=None):
     url = clean_url(url)
     c = conn.cursor()
@@ -196,6 +207,7 @@ def add_url(url, h1=None, parent_url=None):
     return get_url_by_url(url)
 
 
+@use_connection
 def add_ai_log(instructions, response, model, prompt_file, prompt_name):
     c = conn.cursor()
 
@@ -205,10 +217,14 @@ def add_ai_log(instructions, response, model, prompt_file, prompt_name):
     )
     conn.commit()
 
+
+@use_connection
 def get_ai_log():
     df = pd.read_sql_query(f"SELECT * FROM ai_log", conn)
     return df
 
+
+@use_connection
 def set_url_destiny(url, destiny):
     url = clean_url(url)
     destiny = clean_url(destiny)
@@ -222,6 +238,7 @@ def set_url_destiny(url, destiny):
     conn.commit()
 
 
+@use_connection
 def set_url_h1(url, value):
     value = str(value).strip()
     url = clean_url(url)
@@ -230,6 +247,7 @@ def set_url_h1(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_h1_by_id(id, value):
     value = str(value).strip()
 
@@ -238,29 +256,44 @@ def set_url_h1_by_id(id, value):
     conn.commit()
 
 
+@use_connection
 def set_url_ai_processed_by_id(id, json_str):
     value = 1
     value = str(value).strip()
     c = conn.cursor()
-    c.execute(
+    c.execute(
+        "UPDATE urls SET ai_processed = ? , json_ai = ? WHERE id = ?",
+        (value, json_str, id),
+    )
     conn.commit()
 
+
+@use_connection
 def set_url_empty_ai_processed_by_id(id, json_str="empty result"):
     value = 1
     value = str(value).strip()
     c = conn.cursor()
-    c.execute(
+    c.execute(
+        "UPDATE urls SET ai_processed = ? , json_ai = ? WHERE ai_processed = 0 AND id = ?",
+        (value, json_str, id),
+    )
     conn.commit()
 
+
+@use_connection
 def set_url_ai_processed_by_url(url, json_str):
     value = 1
     value = str(value).strip()
     url = clean_url(url)
     c = conn.cursor()
-    c.execute(
+    c.execute(
+        "UPDATE urls SET ai_processed = ?, json_ai = ? WHERE url = ?",
+        (value, json_str, url),
+    )
     conn.commit()
 
 
+@use_connection
 def set_url_description(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -268,6 +301,7 @@ def set_url_description(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_description_links(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -275,6 +309,7 @@ def set_url_description_links(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_json(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -282,6 +317,7 @@ def set_url_json(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_error(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -289,6 +325,7 @@ def set_url_error(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_type_by_id(url_id, url_type):
     c = conn.cursor()
     c.execute(f"UPDATE urls SET url_type = '{url_type}' WHERE id = {url_id}")
@@ -312,6 +349,7 @@ def clean_url(url):
     return url
 
 
+@use_connection
 def get_untouched_urls(
     limit=10, randomize=True, ignore_valid_prefix=False, only_parents=True
 ):
@@ -331,6 +369,7 @@ def get_untouched_urls(
     return df
 
 
+@use_connection
 def touch_url(url):
     url = clean_url(url)
     c = conn.cursor()
@@ -338,6 +377,7 @@ def touch_url(url):
     conn.commit()
 
 
+@use_connection
 def untouch_url(url):
     url = clean_url(url)
     c = conn.cursor()
@@ -345,12 +385,14 @@ def untouch_url(url):
     conn.commit()
 
 
+@use_connection
 def untouch_all_urls():
     c = conn.cursor()
     c.execute("UPDATE urls SET last_touch = NULL WHERE history = 0")
     conn.commit()
 
 
+@use_connection
 def set_all_urls_as_history():
     c = conn.cursor()
     c.execute("UPDATE urls SET history = 1")
@@ -382,9 +424,9 @@ def merge_dbs() -> None:
             row["description"],
             row["json"],
         )
-    # ßmerge_url(df)
 
 
+@use_connection
 def merge_url(url, h1, last_touch, created_at, description, json):
     url = clean_url(url)
     c = conn.cursor()
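The recurring `@use_connection` decorator above replaces the per-function `conn = get_db_connection()` calls: the wrapper opens the SQLite connection, publishes it as the module-global `conn` that the query functions already reference, and the `with` block commits (or rolls back) the transaction when the call returns. A standalone sketch of the pattern, not the module's exact code (the database path and the `urls` table are illustrative):

    import sqlite3

    def use_connection(func):
        def provide_connection(*args, **kwargs):
            global conn
            # assumed example path; the real module builds it from config
            with sqlite3.connect("example.db") as conn:
                return func(*args, **kwargs)

        return provide_connection

    @use_connection
    def count_urls():
        # `conn` is the module-global set up by the decorator;
        # assumes a `urls` table already exists in the database
        return conn.execute("SELECT COUNT(*) FROM urls").fetchone()[0]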
ohmyscrapper/modules/classify_urls.py
CHANGED
@@ -9,15 +9,19 @@ def classify_urls(recursive=False):
 
     keep_alive = True
     while keep_alive:
-        print("
+        print("#️⃣ URL Classifier woke up to classify urls!")
         for index, row_prefix in df.iterrows():
-            df_urls = urls_manager.get_url_like_unclassified(
+            df_urls = urls_manager.get_url_like_unclassified(
+                like_condition=row_prefix["url_prefix"]
+            )
             for index, row_urls in df_urls.iterrows():
-                urls_manager.set_url_type_by_id(
+                urls_manager.set_url_type_by_id(
+                    url_id=row_urls["id"], url_type=row_prefix["url_type"]
+                )
 
         if not recursive:
-            print("
+            print("#️⃣ URL Classifier said: I'm done! See you soon...")
             keep_alive = False
         else:
-            print("
+            print("#️⃣ URL Classifier is taking a nap...")
             time.sleep(10)