ohmyscrapper 0.2.3__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ohmyscrapper/__init__.py +44 -22
- ohmyscrapper/core/config.py +107 -0
- ohmyscrapper/core/config_files.py +73 -0
- ohmyscrapper/core/default_files/config.yaml +16 -0
- ohmyscrapper/core/default_files/url_sniffing.yaml +25 -0
- ohmyscrapper/core/default_files/url_types.yaml +5 -0
- ohmyscrapper/models/urls_manager.py +95 -41
- ohmyscrapper/modules/classify_urls.py +14 -6
- ohmyscrapper/modules/load_txt.py +79 -11
- ohmyscrapper/modules/process_with_ai.py +72 -36
- ohmyscrapper/modules/scrap_urls.py +130 -121
- ohmyscrapper/modules/seed.py +28 -2
- ohmyscrapper/modules/show.py +22 -14
- ohmyscrapper/modules/sniff_url.py +112 -45
- ohmyscrapper/modules/untouch_all.py +1 -1
- {ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.7.0.dist-info}/METADATA +21 -15
- ohmyscrapper-0.7.0.dist-info/RECORD +21 -0
- ohmyscrapper-0.2.3.dist-info/RECORD +0 -16
- {ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.7.0.dist-info}/WHEEL +0 -0
- {ohmyscrapper-0.2.3.dist-info → ohmyscrapper-0.7.0.dist-info}/entry_points.txt +0 -0
ohmyscrapper/__init__.py
CHANGED
|
@@ -3,7 +3,7 @@ import argparse
|
|
|
3
3
|
from ohmyscrapper.modules.classify_urls import classify_urls
|
|
4
4
|
from ohmyscrapper.modules.sniff_url import sniff_url
|
|
5
5
|
from ohmyscrapper.modules.load_txt import load_txt
|
|
6
|
-
from ohmyscrapper.modules.seed import seed
|
|
6
|
+
from ohmyscrapper.modules.seed import seed, export_url_types_to_file
|
|
7
7
|
from ohmyscrapper.modules.scrap_urls import scrap_urls
|
|
8
8
|
from ohmyscrapper.modules.show import (
|
|
9
9
|
show_url,
|
|
@@ -15,30 +15,40 @@ from ohmyscrapper.modules.show import (
|
|
|
15
15
|
from ohmyscrapper.modules.untouch_all import untouch_all
|
|
16
16
|
from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
|
|
17
17
|
from ohmyscrapper.modules.merge_dbs import merge_dbs
|
|
18
|
+
from ohmyscrapper.core.config import update
|
|
18
19
|
|
|
19
20
|
|
|
20
21
|
def main():
|
|
21
22
|
parser = argparse.ArgumentParser(prog="ohmyscrapper")
|
|
22
|
-
parser.add_argument("--version", action="version", version="%(prog)s v0.
|
|
23
|
+
parser.add_argument("--version", action="version", version="%(prog)s v0.7.0")
|
|
23
24
|
|
|
25
|
+
update()
|
|
24
26
|
subparsers = parser.add_subparsers(dest="command", help="Available commands")
|
|
25
27
|
start_parser = subparsers.add_parser(
|
|
26
|
-
"start",
|
|
28
|
+
"start",
|
|
29
|
+
help="Make the entire process of 📦 loading, 🐶 scraping and 📜🖋️ exporting with the default configuration.",
|
|
27
30
|
)
|
|
28
31
|
|
|
29
32
|
start_parser.add_argument(
|
|
30
|
-
"--ai",
|
|
33
|
+
"--ai",
|
|
34
|
+
default=False,
|
|
35
|
+
help="Make the entire process of loading, processing, reprocessing with AI and exporting with the default configuration.",
|
|
36
|
+
action="store_true",
|
|
31
37
|
)
|
|
32
38
|
|
|
33
|
-
ai_process_parser = subparsers.add_parser(
|
|
34
|
-
"ai", help="Process with AI."
|
|
35
|
-
)
|
|
39
|
+
ai_process_parser = subparsers.add_parser("ai", help="Process with AI.")
|
|
36
40
|
ai_process_parser.add_argument(
|
|
37
41
|
"--history", default=False, help="Reprocess ai history", action="store_true"
|
|
38
42
|
)
|
|
39
43
|
|
|
40
44
|
seed_parser = subparsers.add_parser(
|
|
41
|
-
"seed", help="Seed database
|
|
45
|
+
"seed", help="Seed database with `url_types` to classify the `urls`."
|
|
46
|
+
)
|
|
47
|
+
seed_parser.add_argument(
|
|
48
|
+
"--export",
|
|
49
|
+
default=False,
|
|
50
|
+
help="Add all `url_types` from the bank to the `/ohmyscrapper/url_types.yaml` file.",
|
|
51
|
+
action="store_true",
|
|
42
52
|
)
|
|
43
53
|
untouch_parser = subparsers.add_parser(
|
|
44
54
|
"untouch-all", help="Untouch all urls. That resets classification"
|
|
@@ -51,12 +61,15 @@ def main():
|
|
|
51
61
|
"--recursive", default=False, help="Run in recursive mode", action="store_true"
|
|
52
62
|
)
|
|
53
63
|
|
|
54
|
-
load_txt_parser = subparsers.add_parser("load", help="Load txt file")
|
|
64
|
+
load_txt_parser = subparsers.add_parser("load", help="📦 Load txt file")
|
|
65
|
+
load_txt_parser.add_argument(
|
|
66
|
+
"-input", default=None, help="File/Folder path or url."
|
|
67
|
+
)
|
|
55
68
|
load_txt_parser.add_argument(
|
|
56
|
-
"
|
|
69
|
+
"--verbose", default=False, help="Run in verbose mode", action="store_true"
|
|
57
70
|
)
|
|
58
71
|
|
|
59
|
-
scrap_urls_parser = subparsers.add_parser("scrap-urls", help="Scrap urls")
|
|
72
|
+
scrap_urls_parser = subparsers.add_parser("scrap-urls", help="🐶 Scrap urls")
|
|
60
73
|
scrap_urls_parser.add_argument(
|
|
61
74
|
"--recursive", default=False, help="Run in recursive mode", action="store_true"
|
|
62
75
|
)
|
|
@@ -69,8 +82,11 @@ def main():
|
|
|
69
82
|
scrap_urls_parser.add_argument(
|
|
70
83
|
"--only-parents", default=False, help="Only parents urls", action="store_true"
|
|
71
84
|
)
|
|
85
|
+
scrap_urls_parser.add_argument(
|
|
86
|
+
"--verbose", default=False, help="Run in verbose mode", action="store_true"
|
|
87
|
+
)
|
|
72
88
|
|
|
73
|
-
sniff_url_parser = subparsers.add_parser("sniff-url", help="Check url")
|
|
89
|
+
sniff_url_parser = subparsers.add_parser("sniff-url", help="🐕 Sniff/Check url")
|
|
74
90
|
sniff_url_parser.add_argument(
|
|
75
91
|
"url", default="https://cesarcardoso.cc/", help="Url to sniff"
|
|
76
92
|
)
|
|
@@ -82,7 +98,7 @@ def main():
|
|
|
82
98
|
show_urls_parser.add_argument("--limit", default=0, help="Limit of lines to show")
|
|
83
99
|
show_urls_parser.add_argument("-url", default="", help="Url to show")
|
|
84
100
|
|
|
85
|
-
export_parser = subparsers.add_parser("export", help="Export urls to csv.")
|
|
101
|
+
export_parser = subparsers.add_parser("export", help="📊🖋️ Export urls to csv.")
|
|
86
102
|
export_parser.add_argument("--limit", default=0, help="Limit of lines to export")
|
|
87
103
|
export_parser.add_argument(
|
|
88
104
|
"--file",
|
|
@@ -96,14 +112,11 @@ def main():
|
|
|
96
112
|
action="store_true",
|
|
97
113
|
)
|
|
98
114
|
|
|
99
|
-
report_parser = subparsers.add_parser(
|
|
115
|
+
report_parser = subparsers.add_parser(
|
|
116
|
+
"report", help="📜🖋️ Export urls report to csv."
|
|
117
|
+
)
|
|
100
118
|
merge_parser = subparsers.add_parser("merge_dbs", help="Merge databases.")
|
|
101
119
|
|
|
102
|
-
# TODO: What is that?
|
|
103
|
-
# seed_parser.set_defaults(func=seed)
|
|
104
|
-
# classify_urls_parser.set_defaults(func=classify_urls)
|
|
105
|
-
# load_txt_parser.set_defaults(func=load_txt)
|
|
106
|
-
|
|
107
120
|
args = parser.parse_args()
|
|
108
121
|
|
|
109
122
|
if args.command == "classify-urls":
|
|
@@ -111,11 +124,14 @@ def main():
|
|
|
111
124
|
return
|
|
112
125
|
|
|
113
126
|
if args.command == "load":
|
|
114
|
-
load_txt(args.
|
|
127
|
+
load_txt(file_name=args.input, verbose=args.verbose)
|
|
115
128
|
return
|
|
116
129
|
|
|
117
130
|
if args.command == "seed":
|
|
118
|
-
|
|
131
|
+
if args.export:
|
|
132
|
+
export_url_types_to_file()
|
|
133
|
+
else:
|
|
134
|
+
seed()
|
|
119
135
|
return
|
|
120
136
|
|
|
121
137
|
if args.command == "untouch-all":
|
|
@@ -132,6 +148,7 @@ def main():
|
|
|
132
148
|
ignore_valid_prefix=args.ignore_type,
|
|
133
149
|
randomize=args.randomize,
|
|
134
150
|
only_parents=args.only_parents,
|
|
151
|
+
verbose=args.verbose,
|
|
135
152
|
)
|
|
136
153
|
return
|
|
137
154
|
|
|
@@ -166,7 +183,12 @@ def main():
|
|
|
166
183
|
|
|
167
184
|
if args.command == "start":
|
|
168
185
|
load_txt()
|
|
169
|
-
scrap_urls(
|
|
186
|
+
scrap_urls(
|
|
187
|
+
recursive=True,
|
|
188
|
+
ignore_valid_prefix=True,
|
|
189
|
+
randomize=False,
|
|
190
|
+
only_parents=False,
|
|
191
|
+
)
|
|
170
192
|
if args.ai:
|
|
171
193
|
process_with_ai()
|
|
172
194
|
export_urls()
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from ohmyscrapper.core import config_files
|
|
3
|
+
|
|
4
|
+
default_app_dir = "ohmyscrapper"
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def get_dir(param="ohmyscrapper"):
    """Return the path of a working directory, creating it when missing.

    Args:
        param: Either the application directory name (the default) or a key
            of the ``default_dirs`` section of ``config.yaml``.

    Returns:
        ``"./" + param`` when ``param`` is the application directory,
        otherwise the value stored under ``default_dirs`` for ``param``.

    Raises:
        Exception: Propagated from ``config_files.get_param`` when ``param``
            is not present in ``default_dirs``.
    """
    if param == default_app_dir:
        folder = "./" + param
    else:
        folder = config_files.get_param(
            parent_param="default_dirs",
            param=param,
            default_app_dir=default_app_dir,
        )

    if not os.path.exists(folder):
        # makedirs creates missing parent directories too (a configured path
        # may be nested), and exist_ok tolerates another process creating the
        # directory between the check above and this call.
        os.makedirs(folder, exist_ok=True)
    return folder
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_files(param):
    """Return the file name stored under ``default_files`` for ``param``."""
    lookup = {
        "parent_param": "default_files",
        "param": param,
        "default_app_dir": default_app_dir,
    }
    return config_files.get_param(**lookup)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_db(param="db_file"):
    """Return a database setting from the ``db`` section of the config.

    The special value ``"folder"`` is not read from the ``db`` section; it
    resolves to the database directory via ``get_dir(param="db")``.
    """
    if param != "folder":
        return config_files.get_param(
            parent_param="db",
            param=param,
            default_app_dir=default_app_dir,
        )
    return get_dir(param="db")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def get_ai(param):
    """Return the value stored under the ``ai`` section for ``param``."""
    section = "ai"
    return config_files.get_param(
        parent_param=section,
        param=param,
        default_app_dir=default_app_dir,
    )
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def load_config(force_default=False):
    """Load ``config.yaml``, falling back to the packaged default once.

    Args:
        force_default: When true, the packaged default configuration is
            written over the user's file and returned.

    Returns:
        The parsed configuration, which is guaranteed to contain a
        ``default_dirs`` entry.

    Raises:
        RuntimeError: If even the packaged default has no ``default_dirs``
            entry. Previously this case recursed until ``RecursionError``
            (itself a ``RuntimeError``), rewriting the file on every level.
    """
    config_params = config_files.create_and_read_config_file(
        file_name="config.yaml",
        default_app_dir=default_app_dir,
        force_default=force_default,
    )

    if config_params is not None and "default_dirs" in config_params:
        return config_params

    if force_default:
        # The default was already forced and is still unusable; retrying
        # would loop forever, so fail with an explicit message instead.
        raise RuntimeError(
            "The packaged default config.yaml has no `default_dirs` section."
        )

    return load_config(force_default=True)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def url_types_file_exists():
    """Tell whether the configured ``url_types`` file exists in the app dir."""
    return config_files.config_file_exists(
        get_files("url_types"), default_app_dir=default_app_dir
    )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def get_url_types():
    """Read the configured ``url_types`` file, creating it if needed."""
    return config_files.create_and_read_config_file(
        get_files("url_types"), default_app_dir=default_app_dir
    )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def get_url_sniffing():
    """Read the configured ``url_sniffing`` file, creating it if needed."""
    sniffing_file = get_files("url_sniffing")
    return config_files.create_and_read_config_file(
        sniffing_file,
        default_app_dir=default_app_dir,
    )
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def append_url_sniffing(data):
    """Append ``data`` to the configured ``url_sniffing`` file."""
    _append_config_file(data, get_files("url_sniffing"))
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def append_url_types(url_types):
    """Append ``url_types`` to the configured ``url_types`` file."""
    target_file = get_files("url_types")
    _append_config_file(url_types, target_file)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def overwrite_config_file(data, file_name):
    """Replace the content of ``file_name`` in the app directory with ``data``."""
    config_files.overwrite_config_file(
        data=data,
        file_name=file_name,
        default_app_dir=default_app_dir,
    )
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _append_config_file(data, file_name):
    """Append ``data`` to ``file_name`` inside the application directory."""
    config_files.append_config_file(
        data=data,
        file_name=file_name,
        default_app_dir=default_app_dir,
    )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def update():
    """Offer to migrate the legacy ``./customize`` folder to ``./ohmyscrapper``.

    Nothing happens unless the legacy folder exists and the new folder does
    not. In that case the user is asked for confirmation; answering exactly
    ``Y`` renames the legacy folder, any other answer leaves it untouched.

    When no answer can be read (stdin is closed or empty, e.g. under cron or
    in a pipeline) the question counts as declined instead of aborting the
    whole program with ``EOFError``.
    """
    legacy_folder = "./customize"
    new_folder = "./ohmyscrapper"

    if not os.path.exists(legacy_folder) or os.path.exists(new_folder):
        return

    try:
        yes_no = input(
            "We detected a legacy folder system for your OhMyScrapper, would you like to update? \n"
            "If you don't update, a new version will be used and your legacy folder will be ignored. \n"
            "[Y] for yes or any other thing to ignore: "
        )
    except EOFError:
        # Non-interactive run: nobody can confirm, so keep the legacy folder.
        return

    if yes_no == "Y":
        os.rename(legacy_folder, new_folder)
        print(" You are up-to-date! =)")
        print("")
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import yaml
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def create_and_read_config_file(file_name, default_app_dir, force_default=False):
    """Return the parsed content of a config file, seeding it from defaults.

    The user's file is read when it exists and ``force_default`` is false.
    If it is missing, parses to ``None`` (empty file), or ``force_default`` is
    set, the packaged default of the same name is loaded, written over the
    user's file, and returned.
    """
    user_path = config_file_path(file_name, default_app_dir)

    if not force_default and os.path.exists(user_path):
        with open(user_path, "r") as handle:
            loaded = yaml.safe_load(handle.read())
        if loaded is not None:
            return loaded

    # Either forced, missing, or empty: fall back to the packaged default and
    # persist it so the next read finds a populated file.
    defaults = _get_default_file(default_file=file_name)
    overwrite_config_file(
        data=defaults, file_name=file_name, default_app_dir=default_app_dir
    )
    return defaults
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def overwrite_config_file(data, file_name, default_app_dir):
    """Replace the content of a config file with the YAML form of ``data``.

    Args:
        data: Object to serialize with ``yaml.safe_dump``.
        file_name: Name of the file inside ``default_app_dir``.
        default_app_dir: Application directory (created if missing by
            ``config_file_path``).

    Raises:
        yaml.YAMLError: If ``data`` cannot be represented by the safe dumper.
            The existing file is left untouched in that case.
    """
    config_file = config_file_path(file_name, default_app_dir)
    # Serialize before opening: opening for writing truncates the file, so a
    # dump failure after the open would leave an empty config behind.
    serialized = yaml.safe_dump(data)
    with open(config_file, "+w") as f:
        f.write(serialized)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def append_config_file(data, file_name, default_app_dir):
    """Append ``data`` as YAML to a config file, then rewrite it normalized.

    The function works in three passes over the same file: dump ``data`` at
    the end, parse the whole file back, and write the parsed result over it.
    The final pass is what the original author describes as preventing
    repetition.
    """
    target = config_file_path(file_name, default_app_dir)

    # Pass 1: add the new entries at the end of the file.
    with open(target, "+a") as handle:
        yaml.dump(data, handle, allow_unicode=True)

    # Pass 2: parse everything the file now contains.
    with open(target, "r") as handle:
        combined = yaml.safe_load(handle.read())

    # Pass 3: replace the file with the parsed result.
    with open(target, "w") as handle:
        yaml.dump(combined, handle, allow_unicode=True)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def get_param(parent_param, param, default_app_dir):
    """Return ``param`` from the ``parent_param`` section of ``config.yaml``.

    Raises:
        Exception: When ``param`` is not a key of the requested section.
    """
    section = create_and_read_config_file(
        file_name="config.yaml", default_app_dir=default_app_dir
    )[parent_param]

    if param not in section:
        raise Exception(f"{param} do not exist in your params {parent_param}.")
    return section[param]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def config_file_exists(file_name, default_app_dir):
    """Tell whether ``file_name`` exists inside ``default_app_dir``.

    Resolving the path goes through ``config_file_path``, which creates the
    application directory when it is missing.
    """
    candidate = config_file_path(file_name, default_app_dir)
    return os.path.exists(candidate)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def config_file_path(file_name, default_app_dir):
    """Return the path of ``file_name`` inside ``default_app_dir``.

    The application directory is created first when it does not exist.
    """
    _ensure_default_app_dir(default_app_dir)
    return os.path.join(default_app_dir, file_name)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _ensure_default_app_dir(default_app_dir):
    """Create the application directory when it does not exist yet.

    Args:
        default_app_dir: Directory path; missing parent directories are
            created as well.
    """
    if not os.path.exists(default_app_dir):
        # exist_ok avoids a FileExistsError when another process creates the
        # directory between the check and the call; makedirs (unlike mkdir)
        # also handles a nested path.
        os.makedirs(default_app_dir, exist_ok=True)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _get_default_file(default_file):
    """Parse a packaged default file from the ``default_files`` directory.

    The directory is located next to this module, so the lookup does not
    depend on the current working directory.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    bundled_path = os.path.join(module_dir, "default_files", default_file)
    with open(bundled_path, "r") as handle:
        return yaml.safe_load(handle.read())
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
db:
|
|
2
|
+
db_file: local.db
|
|
3
|
+
|
|
4
|
+
default_dirs:
|
|
5
|
+
db: ./db
|
|
6
|
+
input: ./input
|
|
7
|
+
output: ./output
|
|
8
|
+
prompts: ./prompts
|
|
9
|
+
templates: ./templates
|
|
10
|
+
|
|
11
|
+
default_files:
|
|
12
|
+
url_types: url_types.yaml
|
|
13
|
+
url_sniffing: url_sniffing.yaml
|
|
14
|
+
|
|
15
|
+
ai:
|
|
16
|
+
default_prompt_file: prompt.md
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
linkedin_feed:
|
|
2
|
+
metatags:
|
|
3
|
+
og:url: url_destiny
|
|
4
|
+
|
|
5
|
+
linkedin_job:
|
|
6
|
+
bodytags:
|
|
7
|
+
h1: title
|
|
8
|
+
metatags:
|
|
9
|
+
og:title: title
|
|
10
|
+
og:description: description
|
|
11
|
+
description: description
|
|
12
|
+
|
|
13
|
+
linkedin_post:
|
|
14
|
+
bodytags:
|
|
15
|
+
h1: title
|
|
16
|
+
metatags:
|
|
17
|
+
og:title: title
|
|
18
|
+
og:description: description
|
|
19
|
+
description: description
|
|
20
|
+
|
|
21
|
+
linkedin_redirect:
|
|
22
|
+
metatags:
|
|
23
|
+
og:url: url_destiny
|
|
24
|
+
atags:
|
|
25
|
+
first-tag-as-url_destiny: 5
|