ohmyscrapper 0.4.0__tar.gz → 0.7.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/PKG-INFO +2 -2
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/README.md +1 -1
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/pyproject.toml +4 -2
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/src/ohmyscrapper/__init__.py +32 -6
- ohmyscrapper-0.7.1/src/ohmyscrapper/core/config.py +107 -0
- ohmyscrapper-0.7.1/src/ohmyscrapper/core/config_files.py +73 -0
- ohmyscrapper-0.7.1/src/ohmyscrapper/core/default_files/config.yaml +16 -0
- ohmyscrapper-0.7.1/src/ohmyscrapper/core/default_files/url_sniffing.yaml +25 -0
- ohmyscrapper-0.7.1/src/ohmyscrapper/core/default_files/url_types.yaml +5 -0
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/src/ohmyscrapper/models/urls_manager.py +42 -29
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/src/ohmyscrapper/modules/classify_urls.py +5 -1
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/src/ohmyscrapper/modules/load_txt.py +19 -17
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/src/ohmyscrapper/modules/process_with_ai.py +17 -13
- ohmyscrapper-0.7.1/src/ohmyscrapper/modules/scrap_urls.py +187 -0
- ohmyscrapper-0.7.1/src/ohmyscrapper/modules/seed.py +33 -0
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/src/ohmyscrapper/modules/show.py +15 -14
- ohmyscrapper-0.7.1/src/ohmyscrapper/modules/sniff_url.py +155 -0
- ohmyscrapper-0.4.0/src/ohmyscrapper/modules/scrap_urls.py +0 -209
- ohmyscrapper-0.4.0/src/ohmyscrapper/modules/seed.py +0 -7
- ohmyscrapper-0.4.0/src/ohmyscrapper/modules/sniff_url.py +0 -88
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/src/ohmyscrapper/__main__.py +0 -0
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/src/ohmyscrapper/modules/merge_dbs.py +0 -0
- {ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/src/ohmyscrapper/modules/untouch_all.py +0 -0
{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ohmyscrapper
-Version: 0.4.0
+Version: 0.7.1
 Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>

@@ -16,7 +16,7 @@ Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown

-# 🐶 OhMyScrapper - v0.4.0
+# 🐶 OhMyScrapper - v0.7.1

 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.
{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "ohmyscrapper"
-version = "0.4.0"
+version = "0.7.1"
 description = "OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions."
 readme = "README.md"
 authors = [

@@ -29,11 +29,13 @@ build-backend = "uv_build"
 [tool.bumpversion]
 tag = true
 tag_name = "v{new_version}"
-pre_commit_hooks = ["uv sync --upgrade", "git add uv.lock"]
+pre_commit_hooks = ["uvx black ./src", "git add src", "git commit -m 'chore: Beautify with black'", "uv sync --upgrade", "git add uv.lock"]
 commit = true

 [[tool.bumpversion.files]]
 filename = "pyproject.toml"
+search = 'version = "{current_version}"'
+replace = 'version = "{new_version}"'

 [[tool.bumpversion.files]]
 filename = "README.md"
{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/src/ohmyscrapper/__init__.py

@@ -3,7 +3,7 @@ import argparse
 from ohmyscrapper.modules.classify_urls import classify_urls
 from ohmyscrapper.modules.sniff_url import sniff_url
 from ohmyscrapper.modules.load_txt import load_txt
-from ohmyscrapper.modules.seed import seed
+from ohmyscrapper.modules.seed import seed, export_url_types_to_file
 from ohmyscrapper.modules.scrap_urls import scrap_urls
 from ohmyscrapper.modules.show import (
     show_url,

@@ -15,17 +15,22 @@ from ohmyscrapper.modules.show import (
 from ohmyscrapper.modules.untouch_all import untouch_all
 from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
 from ohmyscrapper.modules.merge_dbs import merge_dbs
+from ohmyscrapper.core.config import update


 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.4.0")
+    parser.add_argument("--version", action="version", version="%(prog)s v0.7.1")

+    update()
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
     start_parser = subparsers.add_parser(
         "start",
         help="Make the entire process of 📦 loading, 🐶 scraping and 📜🖋️ exporting with the default configuration.",
     )
+    start_parser.add_argument(
+        "-input", default=None, help="File/Folder path or url for pre-loading."
+    )

     start_parser.add_argument(
         "--ai",

@@ -40,7 +45,13 @@ def main():
     )

     seed_parser = subparsers.add_parser(
-        "seed", help="Seed database
+        "seed", help="Seed database with `url_types` to classify the `urls`."
+    )
+    seed_parser.add_argument(
+        "--export",
+        default=False,
+        help="Add all `url_types` from the bank to the `/ohmyscrapper/url_types.yaml` file.",
+        action="store_true",
     )
     untouch_parser = subparsers.add_parser(
         "untouch-all", help="Untouch all urls. That resets classification"

@@ -54,7 +65,9 @@ def main():
     )

     load_txt_parser = subparsers.add_parser("load", help="📦 Load txt file")
-    load_txt_parser.add_argument(
+    load_txt_parser.add_argument(
+        "-input", default=None, help="File/Folder path or url."
+    )
     load_txt_parser.add_argument(
         "--verbose", default=False, help="Run in verbose mode", action="store_true"
     )

@@ -75,6 +88,9 @@ def main():
     scrap_urls_parser.add_argument(
         "--verbose", default=False, help="Run in verbose mode", action="store_true"
     )
+    scrap_urls_parser.add_argument(
+        "-input", default=None, help="File/Folder path or url for pre-loading."
+    )

     sniff_url_parser = subparsers.add_parser("sniff-url", help="🐕 Sniff/Check url")
     sniff_url_parser.add_argument(

@@ -118,7 +134,10 @@ def main():
         return

     if args.command == "seed":
-
+        if args.export:
+            export_url_types_to_file()
+        else:
+            seed()
         return

     if args.command == "untouch-all":

@@ -130,6 +149,9 @@ def main():
         return

     if args.command == "scrap-urls":
+        if args.input != None:
+            load_txt(file_name=args.input, verbose=args.verbose)
+
         scrap_urls(
             recursive=args.recursive,
             ignore_valid_prefix=args.ignore_type,

@@ -169,7 +191,11 @@ def main():
         return

     if args.command == "start":
-
+        if args.input != None:
+            load_txt(file_name=args.input)
+        else:
+            load_txt()
+
         scrap_urls(
             recursive=True,
             ignore_valid_prefix=True,
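For orientation, the flags added above map to invocations along these lines (a sketch: the package ships a __main__.py, so running it as a module should work, and the single-dash -input spelling is taken verbatim from the argparse calls in the diff):

    python -m ohmyscrapper start -input ./notes.txt    # pre-load a file/folder/url, then run the full pipeline
    python -m ohmyscrapper load -input ./notes          # 📦 load a file, folder or url
    python -m ohmyscrapper scrap-urls -input ./notes    # pre-load before scraping
    python -m ohmyscrapper seed --export                 # per the --export help: write all url_types from the bank into ohmyscrapper/url_types.yaml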
ohmyscrapper-0.7.1/src/ohmyscrapper/core/config.py

@@ -0,0 +1,107 @@
+import os
+from ohmyscrapper.core import config_files
+
+default_app_dir = "ohmyscrapper"
+
+
+def get_dir(param="ohmyscrapper"):
+    parent_param = "default_dirs"
+
+    if param == default_app_dir:
+        folder = "./" + param
+    else:
+        folder = config_files.get_param(
+            parent_param=parent_param, param=param, default_app_dir=default_app_dir
+        )
+    if not os.path.exists(folder):
+        os.mkdir(folder)
+    return folder
+
+
+def get_files(param):
+    parent_param = "default_files"
+    return config_files.get_param(
+        parent_param=parent_param, param=param, default_app_dir=default_app_dir
+    )
+
+
+def get_db(param="db_file"):
+    if param == "folder":
+        return get_dir(param="db")
+    return config_files.get_param(
+        parent_param="db", param=param, default_app_dir=default_app_dir
+    )
+
+
+def get_ai(param):
+    return config_files.get_param(
+        parent_param="ai", param=param, default_app_dir=default_app_dir
+    )
+
+
+def load_config(force_default=False):
+    config_file_name = "config.yaml"
+    config_params = config_files.create_and_read_config_file(
+        file_name=config_file_name,
+        default_app_dir=default_app_dir,
+        force_default=force_default,
+    )
+
+    if config_params is None or "default_dirs" not in config_params:
+        config_params = load_config(force_default=True)
+
+    return config_params
+
+
+def url_types_file_exists():
+    url_types_file = get_files("url_types")
+    return config_files.config_file_exists(
+        url_types_file, default_app_dir=default_app_dir
+    )
+
+
+def get_url_types():
+    url_types_file = get_files("url_types")
+    return config_files.create_and_read_config_file(
+        url_types_file, default_app_dir=default_app_dir
+    )
+
+
+def get_url_sniffing():
+    file = get_files("url_sniffing")
+    return config_files.create_and_read_config_file(
+        file, default_app_dir=default_app_dir
+    )
+
+
+def append_url_sniffing(data):
+    file = get_files("url_sniffing")
+    _append_config_file(data, file)
+
+
+def append_url_types(url_types):
+    url_types_file = get_files("url_types")
+    _append_config_file(url_types, url_types_file)
+
+
+def overwrite_config_file(data, file_name):
+    config_files.overwrite_config_file(data, file_name, default_app_dir=default_app_dir)
+
+
+def _append_config_file(data, file_name):
+    config_files.append_config_file(data, file_name, default_app_dir=default_app_dir)
+
+
+def update():
+    legacy_folder = "./customize"
+    new_folder = "./ohmyscrapper"
+    if os.path.exists(legacy_folder) and not os.path.exists(new_folder):
+        yes_no = input(
+            "We detected a legacy folder system for your OhMyScrapper, would you like to update? \n"
+            "If you don't update, a new version will be used and your legacy folder will be ignored. \n"
+            "[Y] for yes or any other thing to ignore: "
+        )
+        if yes_no == "Y":
+            os.rename(legacy_folder, new_folder)
+            print(" You are up-to-date! =)")
+            print("")
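The modules touched later in this diff (load_txt.py and urls_manager.py) resolve paths through these helpers instead of hard-coding them. A minimal sketch of that calling pattern; the values in the comments assume the stock config.yaml shipped as a default below:

    from ohmyscrapper.core import config

    input_dir = config.get_dir("input")   # "./input" via default_dirs.input; the directory is created if missing
    db_file = config.get_db()             # "local.db" via db.db_file
    url_types = config.get_url_types()    # dict read from ohmyscrapper/url_types.yaml, created from the bundled default on first use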
ohmyscrapper-0.7.1/src/ohmyscrapper/core/config_files.py

@@ -0,0 +1,73 @@
+import os
+import yaml
+
+
+def create_and_read_config_file(file_name, default_app_dir, force_default=False):
+    config_file = config_file_path(file_name, default_app_dir)
+    if force_default or not os.path.exists(config_file):
+        config_params = _get_default_file(default_file=file_name)
+        overwrite_config_file(
+            data=config_params, file_name=file_name, default_app_dir=default_app_dir
+        )
+    else:
+        with open(config_file, "r") as f:
+            config_params = yaml.safe_load(f.read())
+    if config_params is None:
+        config_params = create_and_read_config_file(
+            file_name=file_name, default_app_dir=default_app_dir, force_default=True
+        )
+    return config_params
+
+
+def overwrite_config_file(data, file_name, default_app_dir):
+    config_file = config_file_path(file_name, default_app_dir)
+    with open(config_file, "+w") as f:
+        f.write(yaml.safe_dump(data))
+
+
+def append_config_file(data, file_name, default_app_dir):
+    config_file = config_file_path(file_name, default_app_dir)
+    # append
+    with open(config_file, "+a") as f:
+        yaml.dump(data, f, allow_unicode=True)
+    # read
+    with open(config_file, "r") as f:
+        data = yaml.safe_load(f.read())
+    # overwrite preventing repetition
+    with open(config_file, "w") as f:
+        yaml.dump(data, f, allow_unicode=True)
+
+
+def get_param(parent_param, param, default_app_dir):
+    default_dirs = create_and_read_config_file(
+        file_name="config.yaml", default_app_dir=default_app_dir
+    )[parent_param]
+
+    if param in default_dirs:
+        return default_dirs[param]
+    else:
+        raise Exception(f"{param} do not exist in your params {parent_param}.")
+
+
+def config_file_exists(file_name, default_app_dir):
+    return os.path.exists(config_file_path(file_name, default_app_dir))
+
+
+def config_file_path(file_name, default_app_dir):
+    _ensure_default_app_dir(default_app_dir)
+    config_file = os.path.join(default_app_dir, file_name)
+    return config_file
+
+
+def _ensure_default_app_dir(default_app_dir):
+    if not os.path.exists(default_app_dir):
+        os.mkdir(default_app_dir)
+
+
+def _get_default_file(default_file):
+    default_files_dir = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)), "default_files"
+    )
+    default_file = os.path.join(default_files_dir, default_file)
+    with open(default_file, "r") as f:
+        return yaml.safe_load(f.read())
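These helpers lazily materialise user-editable copies of the bundled defaults: the first read copies the file from core/default_files/ into the ./ohmyscrapper directory, later reads return whatever the user has edited, and an empty file falls back to the default again. A short sketch of that round trip, using the stock config.yaml shown next:

    from ohmyscrapper.core import config_files

    # Creates ./ohmyscrapper/config.yaml from the bundled default if it is missing, otherwise reads it.
    params = config_files.create_and_read_config_file("config.yaml", default_app_dir="ohmyscrapper")
    print(params["default_dirs"]["input"])   # "./input" with the default file below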
ohmyscrapper-0.7.1/src/ohmyscrapper/core/default_files/config.yaml

@@ -0,0 +1,16 @@
+db:
+  db_file: local.db
+
+default_dirs:
+  db: ./db
+  input: ./input
+  output: ./output
+  prompts: ./prompts
+  templates: ./templates
+
+default_files:
+  url_types: url_types.yaml
+  url_sniffing: url_sniffing.yaml
+
+ai:
+  default_prompt_file: prompt.md
ohmyscrapper-0.7.1/src/ohmyscrapper/core/default_files/url_sniffing.yaml

@@ -0,0 +1,25 @@
+linkedin_feed:
+  metatags:
+    og:url: url_destiny
+
+linkedin_job:
+  bodytags:
+    h1: title
+  metatags:
+    og:title: title
+    og:description: description
+    description: description
+
+linkedin_post:
+  bodytags:
+    h1: title
+  metatags:
+    og:title: title
+    og:description: description
+    description: description
+
+linkedin_redirect:
+  metatags:
+    og:url: url_destiny
+  atags:
+    first-tag-as-url_destiny: 5
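Each top-level key here is a url_type, and each entry maps a tag on the fetched page to a field on the urls table (title, description, url_destiny). The sniff_url.py and scrap_urls.py modules that consume these rules are not part of this excerpt, so the following is only an illustrative sketch of how such a mapping could be applied; the function name is hypothetical, BeautifulSoup is assumed, and the atags rule is left out:

    from bs4 import BeautifulSoup
    from ohmyscrapper.core import config

    def apply_sniffing_rules(html, url_type):
        # Hypothetical helper: extract the configured fields for one url_type from a page.
        rules = config.get_url_sniffing().get(url_type, {})
        soup = BeautifulSoup(html, "html.parser")
        fields = {}
        # metatags: e.g. og:title -> title, og:url -> url_destiny; "description" uses the name attribute
        for meta_name, field in rules.get("metatags", {}).items():
            tag = soup.find("meta", attrs={"property": meta_name}) or soup.find("meta", attrs={"name": meta_name})
            if tag and tag.get("content"):
                fields.setdefault(field, tag["content"])
        # bodytags: e.g. h1 -> title
        for tag_name, field in rules.get("bodytags", {}).items():
            element = soup.find(tag_name)
            if element is not None:
                fields.setdefault(field, element.get_text(strip=True))
        return fields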
{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/src/ohmyscrapper/models/urls_manager.py

@@ -4,16 +4,19 @@ import time
 import glob
 import pandas as pd
 from urllib.parse import urlparse, urlunparse
+from ohmyscrapper.core import config


 def get_db_dir():
-
-
-
+    db_folder = config.get_dir("db")
+    if not os.path.exists(db_folder):
+        os.mkdir(db_folder)
+    return db_folder


 def get_db_path():
-
+    db_file = config.get_db()
+    return os.path.join(get_db_dir(), db_file)


 def get_db_connection():

@@ -26,7 +29,11 @@ def use_connection(func):
     def provide_connection(*args, **kwargs):
         global conn
         with get_db_connection() as conn:
-
+            try:
+                return func(*args, **kwargs)
+            except:
+                update_db()
+                return func(*args, **kwargs)

     return provide_connection


@@ -35,7 +42,7 @@ def create_tables(conn):

     c = conn.cursor()
     c.execute(
-        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT,
+        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, title TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
     )
     c.execute(
         "CREATE TABLE IF NOT EXISTS ai_log (id INTEGER PRIMARY KEY, instructions STRING, response STRING, model STRING, prompt_file STRING, prompt_name STRING, created_at DATETIME)"

@@ -46,12 +53,18 @@ def create_tables(conn):
     )


-def
-
-
-
-
-
+def update_db():
+    try:
+        c = conn.cursor()
+        c.execute("ALTER TABLE urls RENAME COLUMN h1 TO title")
+    except:
+        pass
+
+
+def seeds(seeds={}):
+
+    for url_type, url_prefix in seeds.items():
+        add_urls_valid_prefix(url_prefix, url_type)

     return True

@@ -117,7 +130,7 @@ def get_urls_report():
         SELECT
             u.id,
             u.url,
-            u.
+            u.title
         FROM urls u
         INNER JOIN parent_url p
         ON u.url = p.parent_url

@@ -126,9 +139,9 @@ def get_urls_report():
             u.id,
             u.url_type,
             u.url,
-            COALESCE(u.
+            COALESCE(u.title, p.title) as title,
             p.url as parent_url,
-            p.
+            p.title as parent_title
         FROM urls u
         LEFT JOIN parents p
         ON u.parent_url = p.url

@@ -184,12 +197,12 @@ def get_url_like_unclassified(like_condition):


 @use_connection
-def add_url(url,
+def add_url(url, title=None, parent_url=None):
     url = clean_url(url)
     c = conn.cursor()

-    if
-
+    if title is not None:
+        title = title.strip()

     if parent_url is None:
         parent_url = None

@@ -198,8 +211,8 @@ def add_url(url, h1=None, parent_url=None):

     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url,
-            (url,
+            "INSERT INTO urls (url, title, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
+            (url, title, parent_url, int(time.time())),
         )
         conn.commit()

@@ -238,20 +251,20 @@ def set_url_destiny(url, destiny):


 @use_connection
-def
+def set_url_title(url, value):
     value = str(value).strip()
     url = clean_url(url)
     c = conn.cursor()
-    c.execute("UPDATE urls SET
+    c.execute("UPDATE urls SET title = ? WHERE url = ?", (value, url))
     conn.commit()


 @use_connection
-def
+def set_url_title_by_id(id, value):
     value = str(value).strip()

     c = conn.cursor()
-    c.execute("UPDATE urls SET
+    c.execute("UPDATE urls SET title = ? WHERE id = ?", (value, id))
     conn.commit()


@@ -426,16 +439,16 @@ def merge_dbs() -> None:


 @use_connection
-def merge_url(url,
+def merge_url(url, title, last_touch, created_at, description, json):
     url = clean_url(url)
     c = conn.cursor()

-    if
-
+    if title is not None:
+        title = title.strip()

     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url,
-            (url,
+            "INSERT INTO urls (url, title, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
+            (url, title, last_touch, created_at, description, json),
        )
         conn.commit()
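With the h1 column renamed, add_url, merge_url and the setters all speak title, and the try/except added to use_connection gives existing databases a migration path: the first failing query triggers update_db(), whose ALTER TABLE ... RENAME COLUMN h1 TO title brings an old schema in line before the call is retried. A brief usage sketch of the renamed helpers; treating the report as a pandas DataFrame is an assumption, since only the import is visible here:

    import ohmyscrapper.models.urls_manager as urls_manager

    urls_manager.add_url("https://example.com/job/123", title="Example job")
    urls_manager.set_url_title("https://example.com/job/123", "Example job (updated)")
    report = urls_manager.get_urls_report()   # presumably a DataFrame; pandas is imported by the module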
{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/src/ohmyscrapper/modules/classify_urls.py

@@ -1,11 +1,15 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.modules import seed
 import pandas as pd
 import time


 def classify_urls(recursive=False):
-    urls_manager.seeds()
     df = urls_manager.get_urls_valid_prefix()
+    if len(df) == 0:
+        seed.seed()
+        classify_urls(recursive=recursive)
+        return

     keep_alive = True
     while keep_alive:
{ohmyscrapper-0.4.0 → ohmyscrapper-0.7.1}/src/ohmyscrapper/modules/load_txt.py

@@ -1,6 +1,7 @@
 import os
 from urlextract import URLExtract
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config


 def _increment_file_name(text_file_content, file_name):

@@ -9,17 +10,15 @@ def _increment_file_name(text_file_content, file_name):
         return text_file_content + f.read()


-def load_txt(file_name=
-
-
-
-    if not os.path.exists("input"):
-        os.mkdir("input")
+def load_txt(file_name="input", verbose=False):
+    input_folder = config.get_dir("input")
+    if not os.path.exists(input_folder):
+        os.mkdir(input_folder)

     urls_manager.seeds()

     text_file_content = ""
-    if file_name is not None:
+    if file_name is not None and not os.path.isdir(file_name):
         print(f"📖 reading file `{file_name}`... ")
         if not os.path.exists(file_name):
             if file_name.startswith("https://") or file_name.startswith("http://"):

@@ -32,27 +31,30 @@ def load_txt(file_name=None, verbose=False):
                 text_file_content=text_file_content, file_name=file_name
             )
     else:
-
-
+        input_folder = config.get_dir("input")
+        print(f"📂 reading {input_folder} directory... ")
+        if file_name is None:
+            dir_files = input_folder
+        else:
+            dir_files = file_name
         text_files = os.listdir(dir_files)
         for file in text_files:
             if not file.endswith(".txt"):
                 text_files.remove(file)
         if len(text_files) == 0:
-            print("No text files found in
+            print(f"No text files found in {input_folder} directory!")
             return
         elif len(text_files) == 1:
             print(f"📖 reading file `{dir_files}/{text_files[0]}`... ")
             text_file_content = _increment_file_name(
                 text_file_content=text_file_content,
-                file_name=dir_files
+                file_name=os.path.join(dir_files, text_files[0]),
             )
         else:
             print("\nChoose a text file. Use `*` for process all and `q` to quit:")
             for index, file in enumerate(text_files):
-                print(f"[{index}]:", dir_files
+                print(f"[{index}]:", os.path.join(dir_files, file))

-            # TODO: there is a better way for sure!
             text_file_option = -1
             while text_file_option < 0 or text_file_option >= len(text_files):
                 text_file_option = input("Enter the file number: ")

@@ -60,7 +62,7 @@ def load_txt(file_name=None, verbose=False):
                 for file in text_files:
                     text_file_content = _increment_file_name(
                         text_file_content=text_file_content,
-                        file_name=dir_files
+                        file_name=os.path.join(dir_files, file),
                     )
                 text_file_option = 0
             elif text_file_option == "q":

@@ -70,9 +72,9 @@ def load_txt(file_name=None, verbose=False):
             if text_file_option >= 0 and text_file_option < len(text_files):
                 text_file_content = _increment_file_name(
                     text_file_content=text_file_content,
-                    file_name=
-
-
+                    file_name=os.path.join(
+                        dir_files, text_files[int(text_file_option)]
+                    ),
                 )

     print("🔎 looking for urls...")