ohmyscrapper 0.2.3__tar.gz → 0.3.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/PKG-INFO +14 -14
- {ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/README.md +12 -12
- {ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/pyproject.toml +2 -2
- {ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/__init__.py +28 -19
- {ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/models/urls_manager.py +57 -16
- ohmyscrapper-0.3.4/src/ohmyscrapper/modules/classify_urls.py +27 -0
- ohmyscrapper-0.3.4/src/ohmyscrapper/modules/load_txt.py +94 -0
- {ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/process_with_ai.py +63 -31
- {ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/scrap_urls.py +80 -49
- {ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/seed.py +1 -1
- {ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/show.py +11 -4
- {ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/untouch_all.py +1 -1
- ohmyscrapper-0.2.3/src/ohmyscrapper/modules/classify_urls.py +0 -23
- ohmyscrapper-0.2.3/src/ohmyscrapper/modules/load_txt.py +0 -32
- {ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/__main__.py +0 -0
- {ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/merge_dbs.py +0 -0
- {ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/sniff_url.py +0 -0
{ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/PKG-INFO
@@ -1,7 +1,7 @@
 Metadata-Version: 2.3
 Name: ohmyscrapper
-Version: 0.2.3
-Summary:
+Version: 0.3.4
+Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
 Requires-Dist: beautifulsoup4>=4.14.3
@@ -16,16 +16,17 @@ Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 
-# OhMyScrapper - v0.2.3
+# 🐶 OhMyScrapper - v0.3.4
 
-
-final
+OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
+final report with general information about job positions.
 
 ## Scope
 
 - Read texts;
-- Extract
--
+- Extract and load urls;
+- Scrapes the urls looking for og:tags and titles;
+- Export a list of links with relevant information;
 
 ## Installation
 
@@ -50,7 +51,7 @@ uvx ohmyscrapper --version
 
 OhMyScrapper works in 3 stages:
 
-1. It collects and loads urls from a text
+1. It collects and loads urls from a text in a database;
 2. It scraps/access the collected urls and read what is relevant. If it finds new urls, they are collected as well;
 3. Export a list of urls in CSV files;
 
@@ -58,7 +59,7 @@ You can do 3 stages with the command:
 ```shell
 ohmyscrapper start
 ```
-> Remember to add your text file in the folder `/input` with the name
+> Remember to add your text file in the folder `/input` with the name that finishes with `.txt`!
 
 You will find the exported files in the folder `/output` like this:
 - `/output/report.csv`
@@ -70,15 +71,14 @@ You will find the exported files in the folder `/output` like this:
 
 ### BUT: if you want to do step by step, here it is:
 
-First we load a text file you would like to look for urls
-use the whatsapp history, but it works with any txt file.
+First we load a text file you would like to look for urls. It it works with any txt file.
 
-The default
-the command `load`:
+The default folder is `/input`. Put one or more text (finished with `.txt`) files
+in this folder and use the command `load`:
 ```shell
 ohmyscrapper load
 ```
-or, if you have another file, just use the argument `-file` like this:
+or, if you have another file in a different folder, just use the argument `-file` like this:
 ```shell
 ohmyscrapper load -file=my-text-file.txt
 ```

{ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/README.md
@@ -1,13 +1,14 @@
-# OhMyScrapper - v0.2.3
+# 🐶 OhMyScrapper - v0.3.4
 
-
-final
+OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
+final report with general information about job positions.
 
 ## Scope
 
 - Read texts;
-- Extract
--
+- Extract and load urls;
+- Scrapes the urls looking for og:tags and titles;
+- Export a list of links with relevant information;
 
 ## Installation
 
@@ -32,7 +33,7 @@ uvx ohmyscrapper --version
 
 OhMyScrapper works in 3 stages:
 
-1. It collects and loads urls from a text
+1. It collects and loads urls from a text in a database;
 2. It scraps/access the collected urls and read what is relevant. If it finds new urls, they are collected as well;
 3. Export a list of urls in CSV files;
 
@@ -40,7 +41,7 @@ You can do 3 stages with the command:
 ```shell
 ohmyscrapper start
 ```
-> Remember to add your text file in the folder `/input` with the name
+> Remember to add your text file in the folder `/input` with the name that finishes with `.txt`!
 
 You will find the exported files in the folder `/output` like this:
 - `/output/report.csv`
@@ -52,15 +53,14 @@ You will find the exported files in the folder `/output` like this:
 
 ### BUT: if you want to do step by step, here it is:
 
-First we load a text file you would like to look for urls
-use the whatsapp history, but it works with any txt file.
+First we load a text file you would like to look for urls. It it works with any txt file.
 
-The default
-the command `load`:
+The default folder is `/input`. Put one or more text (finished with `.txt`) files
+in this folder and use the command `load`:
 ```shell
 ohmyscrapper load
 ```
-or, if you have another file, just use the argument `-file` like this:
+or, if you have another file in a different folder, just use the argument `-file` like this:
 ```shell
 ohmyscrapper load -file=my-text-file.txt
 ```

{ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "ohmyscrapper"
-version = "0.2.3"
-description = "
+version = "0.3.4"
+description = "OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions."
 readme = "README.md"
 authors = [
     { name = "Cesar Cardoso", email = "hello@cesarcardoso.cc" }

{ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/__init__.py
@@ -19,20 +19,22 @@ from ohmyscrapper.modules.merge_dbs import merge_dbs
 
 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.2.3")
+    parser.add_argument("--version", action="version", version="%(prog)s v0.3.4")
 
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
     start_parser = subparsers.add_parser(
-        "start",
+        "start",
+        help="Make the entire process of 📦 loading, 🐶 scraping and 📜🖋️ exporting with the default configuration.",
     )
 
     start_parser.add_argument(
-        "--ai",
+        "--ai",
+        default=False,
+        help="Make the entire process of loading, processing, reprocessing with AI and exporting with the default configuration.",
+        action="store_true",
     )
 
-    ai_process_parser = subparsers.add_parser(
-        "ai", help="Process with AI."
-    )
+    ai_process_parser = subparsers.add_parser("ai", help="Process with AI.")
    ai_process_parser.add_argument(
        "--history", default=False, help="Reprocess ai history", action="store_true"
    )
@@ -51,12 +53,13 @@ def main():
         "--recursive", default=False, help="Run in recursive mode", action="store_true"
     )
 
-    load_txt_parser = subparsers.add_parser("load", help="Load txt file")
+    load_txt_parser = subparsers.add_parser("load", help="📦 Load txt file")
+    load_txt_parser.add_argument("-file", default=None, help="File path.")
     load_txt_parser.add_argument(
-        "
+        "--verbose", default=False, help="Run in verbose mode", action="store_true"
     )
 
-    scrap_urls_parser = subparsers.add_parser("scrap-urls", help="Scrap urls")
+    scrap_urls_parser = subparsers.add_parser("scrap-urls", help="🐶 Scrap urls")
     scrap_urls_parser.add_argument(
         "--recursive", default=False, help="Run in recursive mode", action="store_true"
     )
@@ -69,8 +72,11 @@ def main():
     scrap_urls_parser.add_argument(
         "--only-parents", default=False, help="Only parents urls", action="store_true"
     )
+    scrap_urls_parser.add_argument(
+        "--verbose", default=False, help="Run in verbose mode", action="store_true"
+    )
 
-    sniff_url_parser = subparsers.add_parser("sniff-url", help="Check url")
+    sniff_url_parser = subparsers.add_parser("sniff-url", help="🐕 Sniff/Check url")
     sniff_url_parser.add_argument(
         "url", default="https://cesarcardoso.cc/", help="Url to sniff"
     )
@@ -82,7 +88,7 @@ def main():
     show_urls_parser.add_argument("--limit", default=0, help="Limit of lines to show")
     show_urls_parser.add_argument("-url", default="", help="Url to show")
 
-    export_parser = subparsers.add_parser("export", help="Export urls to csv.")
+    export_parser = subparsers.add_parser("export", help="📊🖋️ Export urls to csv.")
     export_parser.add_argument("--limit", default=0, help="Limit of lines to export")
     export_parser.add_argument(
         "--file",
@@ -96,14 +102,11 @@ def main():
         action="store_true",
     )
 
-    report_parser = subparsers.add_parser(
+    report_parser = subparsers.add_parser(
+        "report", help="📜🖋️ Export urls report to csv."
+    )
     merge_parser = subparsers.add_parser("merge_dbs", help="Merge databases.")
 
-    # TODO: What is that?
-    # seed_parser.set_defaults(func=seed)
-    # classify_urls_parser.set_defaults(func=classify_urls)
-    # load_txt_parser.set_defaults(func=load_txt)
-
     args = parser.parse_args()
 
     if args.command == "classify-urls":
@@ -111,7 +114,7 @@ def main():
         return
 
     if args.command == "load":
-        load_txt(args.file)
+        load_txt(file_name=args.file, verbose=args.verbose)
         return
 
     if args.command == "seed":
@@ -132,6 +135,7 @@ def main():
             ignore_valid_prefix=args.ignore_type,
             randomize=args.randomize,
             only_parents=args.only_parents,
+            verbose=args.verbose,
         )
         return
 
@@ -166,7 +170,12 @@ def main():
 
     if args.command == "start":
         load_txt()
-        scrap_urls(
+        scrap_urls(
+            recursive=True,
+            ignore_valid_prefix=True,
+            randomize=False,
+            only_parents=False,
+        )
         if args.ai:
             process_with_ai()
         export_urls()

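The CLI changes above follow one pattern: each subcommand gains `store_true` flags that default to False and are simply forwarded as keyword arguments (`load_txt(file_name=args.file, verbose=args.verbose)`, `scrap_urls(..., verbose=args.verbose)`). A minimal, self-contained sketch of that pattern; the command and flag names mirror the diff but the snippet itself is illustrative, not the package's code:

```python
import argparse

# store_true flags default to False and flip to True when passed,
# so handlers can forward them straight as keyword arguments.
parser = argparse.ArgumentParser(prog="ohmyscrapper-sketch")
subparsers = parser.add_subparsers(dest="command")

load_parser = subparsers.add_parser("load", help="Load txt file")
load_parser.add_argument("-file", default=None, help="File path.")
load_parser.add_argument("--verbose", default=False, action="store_true")

args = parser.parse_args(["load", "--verbose"])
print(args.command, args.file, args.verbose)  # -> load None True
```
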
{ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/models/urls_manager.py
@@ -17,14 +17,21 @@ def get_db_path():
 
 
 def get_db_connection():
+    if not os.path.exists(get_db_path()):
+        create_tables(sqlite3.connect(get_db_path()))
     return sqlite3.connect(get_db_path())
 
 
-
-
+def use_connection(func):
+    def provide_connection(*args, **kwargs):
+        global conn
+        with get_db_connection() as conn:
+            return func(*args, **kwargs)
 
+    return provide_connection
 
-def create_tables():
+
+def create_tables(conn):
 
     c = conn.cursor()
     c.execute(
@@ -38,27 +45,19 @@ def create_tables():
         "CREATE TABLE IF NOT EXISTS urls_valid_prefix (id INTEGER PRIMARY KEY, url_prefix TEXT UNIQUE, url_type TEXT)"
     )
 
-    return pd.read_sql_query("SELECT * FROM urls LIMIT 100", conn)
-
 
-# TODO: not sure this should be something. depends on the project
 def seeds():
-    create_tables()
-
     add_urls_valid_prefix("https://%.linkedin.com/posts/%", "linkedin_post")
     add_urls_valid_prefix("https://lnkd.in/%", "linkedin_redirect")
     add_urls_valid_prefix("https://%.linkedin.com/jobs/view/%", "linkedin_job")
     add_urls_valid_prefix("https://%.linkedin.com/feed/%", "linkedin_feed")
     add_urls_valid_prefix("https://%.linkedin.com/company/%", "linkedin_company")
 
-    # add_urls_valid_prefix("%.pdf", "pdf")
-    # add_url('https://imazon.org.br/categorias/artigos-cientificos/')
-
     return True
 
 
+@use_connection
 def add_urls_valid_prefix(url_prefix, url_type):
-    conn = get_db_connection()
 
     df = pd.read_sql_query(
         f"SELECT * FROM urls_valid_prefix WHERE url_prefix = '{url_prefix}'", conn
@@ -72,6 +71,7 @@ def add_urls_valid_prefix(url_prefix, url_type):
     conn.commit()
 
 
+@use_connection
 def get_urls_valid_prefix_by_type(url_type):
     df = pd.read_sql_query(
         f"SELECT * FROM urls_valid_prefix WHERE url_type = '{url_type}'", conn
@@ -79,12 +79,14 @@ def get_urls_valid_prefix_by_type(url_type):
     return df
 
 
+@use_connection
 def get_urls_valid_prefix_by_id(id):
     df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix WHERE id = '{id}'", conn)
     return df
 
 
 # TODO: pagination required
+@use_connection
 def get_urls_valid_prefix(limit=0):
     if limit > 0:
         df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix LIMIT {limit}", conn)
@@ -94,6 +96,7 @@ def get_urls_valid_prefix(limit=0):
 
 
 # TODO: pagination required
+@use_connection
 def get_urls(limit=0):
     if limit > 0:
         df = pd.read_sql_query(
@@ -104,6 +107,7 @@ def get_urls(limit=0):
     return df
 
 
+@use_connection
 def get_urls_report():
     sql = """
     WITH parent_url AS (
@@ -138,6 +142,7 @@ def get_urls_report():
     return df
 
 
+@use_connection
 def get_url_by_url(url):
     url = clean_url(url)
     df = pd.read_sql_query(f"SELECT * FROM urls WHERE url = '{url}'", conn)
@@ -145,12 +150,14 @@ def get_url_by_url(url):
     return df
 
 
+@use_connection
 def get_url_by_id(id):
     df = pd.read_sql_query(f"SELECT * FROM urls WHERE id = '{id}'", conn)
 
     return df
 
 
+@use_connection
 def get_urls_by_url_type(url_type):
     df = pd.read_sql_query(
         f"SELECT * FROM urls WHERE history = 0 AND url_type = '{url_type}'", conn
@@ -158,6 +165,7 @@ def get_urls_by_url_type(url_type):
     return df
 
 
+@use_connection
 def get_urls_by_url_type_for_ai_process(url_type="linkedin_post", limit=10):
     df = pd.read_sql_query(
         f"SELECT * FROM urls WHERE history = 0 AND url_type = '{url_type}' AND ai_processed = 0 LIMIT {limit}",
@@ -166,6 +174,7 @@ def get_urls_by_url_type_for_ai_process(url_type="linkedin_post", limit=10):
     return df
 
 
+@use_connection
 def get_url_like_unclassified(like_condition):
     df = pd.read_sql_query(
         f"SELECT * FROM urls WHERE history = 0 AND url LIKE '{like_condition}' AND url_type IS NULL",
@@ -174,6 +183,7 @@ def get_url_like_unclassified(like_condition):
     return df
 
 
+@use_connection
 def add_url(url, h1=None, parent_url=None):
     url = clean_url(url)
     c = conn.cursor()
@@ -196,6 +206,7 @@ def add_url(url, h1=None, parent_url=None):
     return get_url_by_url(url)
 
 
+@use_connection
 def add_ai_log(instructions, response, model, prompt_file, prompt_name):
     c = conn.cursor()
 
@@ -205,10 +216,14 @@ def add_ai_log(instructions, response, model, prompt_file, prompt_name):
     )
     conn.commit()
 
+
+@use_connection
 def get_ai_log():
     df = pd.read_sql_query(f"SELECT * FROM ai_log", conn)
     return df
 
+
+@use_connection
 def set_url_destiny(url, destiny):
     url = clean_url(url)
     destiny = clean_url(destiny)
@@ -222,6 +237,7 @@ def set_url_destiny(url, destiny):
     conn.commit()
 
 
+@use_connection
 def set_url_h1(url, value):
     value = str(value).strip()
     url = clean_url(url)
@@ -230,6 +246,7 @@ def set_url_h1(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_h1_by_id(id, value):
     value = str(value).strip()
 
@@ -238,29 +255,44 @@ def set_url_h1_by_id(id, value):
     conn.commit()
 
 
+@use_connection
 def set_url_ai_processed_by_id(id, json_str):
     value = 1
     value = str(value).strip()
     c = conn.cursor()
-    c.execute(
+    c.execute(
+        "UPDATE urls SET ai_processed = ? , json_ai = ? WHERE id = ?",
+        (value, json_str, id),
+    )
     conn.commit()
 
+
+@use_connection
 def set_url_empty_ai_processed_by_id(id, json_str="empty result"):
     value = 1
     value = str(value).strip()
     c = conn.cursor()
-    c.execute(
+    c.execute(
+        "UPDATE urls SET ai_processed = ? , json_ai = ? WHERE ai_processed = 0 AND id = ?",
+        (value, json_str, id),
+    )
     conn.commit()
 
+
+@use_connection
 def set_url_ai_processed_by_url(url, json_str):
     value = 1
     value = str(value).strip()
     url = clean_url(url)
     c = conn.cursor()
-    c.execute(
+    c.execute(
+        "UPDATE urls SET ai_processed = ?, json_ai = ? WHERE url = ?",
+        (value, json_str, url),
+    )
     conn.commit()
 
 
+@use_connection
 def set_url_description(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -268,6 +300,7 @@ def set_url_description(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_description_links(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -275,6 +308,7 @@ def set_url_description_links(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_json(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -282,6 +316,7 @@ def set_url_json(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_error(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -289,6 +324,7 @@ def set_url_error(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_type_by_id(url_id, url_type):
     c = conn.cursor()
     c.execute(f"UPDATE urls SET url_type = '{url_type}' WHERE id = {url_id}")
@@ -312,6 +348,7 @@ def clean_url(url):
     return url
 
 
+@use_connection
 def get_untouched_urls(
     limit=10, randomize=True, ignore_valid_prefix=False, only_parents=True
 ):
@@ -331,6 +368,7 @@
     return df
 
 
+@use_connection
 def touch_url(url):
     url = clean_url(url)
     c = conn.cursor()
@@ -338,6 +376,7 @@ def touch_url(url):
     conn.commit()
 
 
+@use_connection
 def untouch_url(url):
     url = clean_url(url)
     c = conn.cursor()
@@ -345,12 +384,14 @@ def untouch_url(url):
     conn.commit()
 
 
+@use_connection
 def untouch_all_urls():
     c = conn.cursor()
     c.execute("UPDATE urls SET last_touch = NULL WHERE history = 0")
     conn.commit()
 
 
+@use_connection
 def set_all_urls_as_history():
     c = conn.cursor()
     c.execute("UPDATE urls SET history = 1")
@@ -382,9 +423,9 @@ def merge_dbs() -> None:
             row["description"],
             row["json"],
         )
-    # ßmerge_url(df)
 
 
+@use_connection
 def merge_url(url, h1, last_touch, created_at, description, json):
     url = clean_url(url)
     c = conn.cursor()

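The structural change running through `urls_manager.py` is the new `@use_connection` decorator: it opens a connection inside a `with` block and publishes it as the module-level `conn` before each decorated call, so the existing function bodies keep using `conn` unchanged. A standalone sketch of the same idea follows; the `:memory:` path is a stand-in for whatever `get_db_path()` resolves to, and note that sqlite3's connection context manager commits or rolls back the transaction but does not close the connection:

```python
import sqlite3

DB_PATH = ":memory:"  # stand-in; each connect to ':memory:' gives a fresh database


def get_db_connection():
    return sqlite3.connect(DB_PATH)


def use_connection(func):
    # Open a connection, expose it as the module-level `conn`, and run the
    # wrapped function inside sqlite3's transaction context manager.
    def provide_connection(*args, **kwargs):
        global conn
        with get_db_connection() as conn:
            return func(*args, **kwargs)

    return provide_connection


@use_connection
def count_urls():
    c = conn.cursor()  # `conn` was injected by the decorator
    c.execute("CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url TEXT)")
    c.execute("SELECT COUNT(*) FROM urls")
    return c.fetchone()[0]


print(count_urls())  # -> 0
```

The module-level `conn` keeps every existing query untouched; passing the connection explicitly (or closing it in a `finally`) would be the more conventional alternative to this global.
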
ohmyscrapper-0.3.4/src/ohmyscrapper/modules/classify_urls.py
@@ -0,0 +1,27 @@
+import ohmyscrapper.models.urls_manager as urls_manager
+import pandas as pd
+import time
+
+
+def classify_urls(recursive=False):
+    urls_manager.seeds()
+    df = urls_manager.get_urls_valid_prefix()
+
+    keep_alive = True
+    while keep_alive:
+        print("#️⃣ URL Classifier woke up to classify urls!")
+        for index, row_prefix in df.iterrows():
+            df_urls = urls_manager.get_url_like_unclassified(
+                like_condition=row_prefix["url_prefix"]
+            )
+            for index, row_urls in df_urls.iterrows():
+                urls_manager.set_url_type_by_id(
+                    url_id=row_urls["id"], url_type=row_prefix["url_type"]
+                )
+
+        if not recursive:
+            print("#️⃣ URL Classifier said: I'm done! See you soon...")
+            keep_alive = False
+        else:
+            print("#️⃣ URL Classifier is taking a nap...")
+            time.sleep(10)

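The classifier treats each seeded `urls_valid_prefix.url_prefix` as a SQL `LIKE` pattern and stamps matching, still-untyped urls with that prefix's `url_type`. A small in-memory illustration of the matching with hypothetical rows; the module itself walks matches row by row through pandas and `set_url_type_by_id`, but a single UPDATE per prefix expresses the same idea:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE urls (id INTEGER PRIMARY KEY, url TEXT, url_type TEXT)")
conn.executemany(
    "INSERT INTO urls (url) VALUES (?)",
    [("https://www.linkedin.com/jobs/view/12345/",), ("https://lnkd.in/abcde",)],
)

# Prefix patterns as seeded by urls_manager.seeds(); '%' is the LIKE wildcard.
prefixes = [
    ("https://%.linkedin.com/jobs/view/%", "linkedin_job"),
    ("https://lnkd.in/%", "linkedin_redirect"),
]
for pattern, url_type in prefixes:
    conn.execute(
        "UPDATE urls SET url_type = ? WHERE url_type IS NULL AND url LIKE ?",
        (url_type, pattern),
    )

print(conn.execute("SELECT url, url_type FROM urls").fetchall())
# every row now carries the url_type of the first prefix it matched
```
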
ohmyscrapper-0.3.4/src/ohmyscrapper/modules/load_txt.py
@@ -0,0 +1,94 @@
+import os
+from urlextract import URLExtract
+import ohmyscrapper.models.urls_manager as urls_manager
+
+
+def _increment_file_name(text_file_content, file_name):
+    print(f"reading and loading file `{file_name}`... ")
+    with open(file_name, "r") as f:
+        return text_file_content + f.read()
+
+
+def load_txt(file_name=None, verbose=False):
+    if not os.path.exists("db"):
+        os.mkdir("db")
+
+    if not os.path.exists("input"):
+        os.mkdir("input")
+
+    urls_manager.seeds()
+
+    text_file_content = ""
+    if file_name is not None:
+        print(f"📖 reading file `{file_name}`... ")
+        if not os.path.exists(file_name):
+            print(f"\n file `{file_name}` not found.")
+            return
+        text_file_content = _increment_file_name(
+            text_file_content=text_file_content, file_name=file_name
+        )
+    else:
+        print("📂 reading /input directory... ")
+        dir_files = "input"
+        text_files = os.listdir(dir_files)
+        for file in text_files:
+            if not file.endswith(".txt"):
+                text_files.remove(file)
+        if len(text_files) == 0:
+            print("No text files found in /input directory!")
+            return
+        elif len(text_files) == 1:
+            print(f"📖 reading file `{dir_files}/{text_files[0]}`... ")
+            text_file_content = _increment_file_name(
+                text_file_content=text_file_content,
+                file_name=dir_files + "/" + text_files[0],
+            )
+        else:
+            print("\nChoose a text file. Use `*` for process all and `q` to quit:")
+            for index, file in enumerate(text_files):
+                print(f"[{index}]:", dir_files + "/" + file)
+
+            # TODO: there is a better way for sure!
+            text_file_option = -1
+            while text_file_option < 0 or text_file_option >= len(text_files):
+                text_file_option = input("Enter the file number: ")
+                if text_file_option == "*":
+                    for file in text_files:
+                        text_file_content = _increment_file_name(
+                            text_file_content=text_file_content,
+                            file_name=dir_files + "/" + file,
+                        )
+                    text_file_option = 0
+                elif text_file_option == "q":
+                    return
+                elif text_file_option.isdigit():
+                    text_file_option = int(text_file_option)
+                    if text_file_option >= 0 and text_file_option < len(text_files):
+                        text_file_content = _increment_file_name(
+                            text_file_content=text_file_content,
+                            file_name=dir_files
+                            + "/"
+                            + text_files[int(text_file_option)],
+                        )
+
+    print("🔎 looking for urls...")
+    urls_found = put_urls_from_string(
+        text_to_process=text_file_content, verbose=verbose
+    )
+
+    print("--------------------")
+    print("files processed")
+    print(f"📦 {urls_found} urls were extracted and packed into the database")
+
+
+def put_urls_from_string(text_to_process, parent_url=None, verbose=False):
+    if isinstance(text_to_process, str):
+        extractor = URLExtract()
+        for url in extractor.find_urls(text_to_process):
+            urls_manager.add_url(url=url, parent_url=parent_url)
+            if verbose:
+                print(url, "added")
+
+        return len(extractor.find_urls(text_to_process))
+    else:
+        return 0

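Underneath the new interactive file picker, the extraction itself is `URLExtract().find_urls()` from the urlextract dependency, with every hit handed to `urls_manager.add_url`. A reduced sketch of `put_urls_from_string` without the database side; as a hedged observation, the module calls `find_urls` twice (once to insert, once to count), so extracting into a list first would halve that work:

```python
from urlextract import URLExtract


def extract_urls(text):
    """Return every url found in `text`; non-strings yield an empty list."""
    if not isinstance(text, str):
        return []
    extractor = URLExtract()
    return extractor.find_urls(text)  # extract once, reuse the list


urls = extract_urls("ping https://example.com and see http://test.dev/jobs/1")
print(len(urls), urls)  # expects both urls to be found
```
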
{ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/process_with_ai.py
@@ -7,9 +7,11 @@ import time
 import os
 import yaml
 import json
+
 # TODO: !!! REFACTOR !!!
 load_dotenv()
 
+
 def reprocess_ai_history():
     df = urls_manager.get_ai_log().to_dict(orient="records")
     for row in df:
@@ -17,28 +19,34 @@ def reprocess_ai_history():
 
 
 def process_ai_response(response):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    job_positions = xml2dict(response)
+
+    for index, xml_item_children in job_positions.items():
+        for url_child_xml in xml_item_children:
+
+            url_parent = urls_manager.get_url_by_id(url_child_xml["id"])
+            if len(url_parent) > 0:
+                url_parent = url_parent.iloc[0]
+                h1 = url_child_xml.copy()
+                del h1["id"]
+                del h1["url"]
+                h1 = " - ".join(h1.values())
+                if url_parent["description_links"] > 1 and url_child_xml["id"] != "":
+                    print("-- child updated -- \n", url_child_xml["url"], ":", h1)
+                    urls_manager.set_url_h1(url_child_xml["url"], h1)
+                    urls_manager.set_url_ai_processed_by_url(
+                        url_child_xml["url"], str(json.dumps(url_child_xml))
+                    )
+                    if url_parent["url"] != url_child_xml["url"]:
+                        urls_manager.set_url_ai_processed_by_url(
+                            url_parent["url"], "children-update"
+                        )
+                else:
+                    print("-- parent updated -- \n", url_parent["url"], ":", h1)
+                    urls_manager.set_url_h1(url_parent["url"], h1)
+                    urls_manager.set_url_ai_processed_by_url(
+                        url_parent["url"], str(json.dumps(url_child_xml))
+                    )
 
 
 def xml2dict(xml_string):
@@ -46,19 +54,21 @@ def xml2dict(xml_string):
 
     children_items_dict = {}
     for item in soup.find_all():
-        if
+        if item.parent.name == "[document]":
             children_items_dict[item.name] = []
         elif item.parent.name in children_items_dict:
             children_items_dict[item.parent.name].append(_xml_children_to_dict(item))
 
     return children_items_dict
 
+
 def _xml_children_to_dict(xml):
     item_dict = {}
     for item in xml.find_all():
         item_dict[item.name] = item.text
     return item_dict
 
+
 def process_with_ai(recursive=True, triggered_times=0):
     triggered_times = triggered_times + 1
 
@@ -91,13 +101,23 @@ def process_with_ai(recursive=True, triggered_times=0):
     print("prompt:", prompt["name"])
     print("model:", prompt["model"])
     print("description:", prompt["description"])
-    prompt["instructions"] = prompt["instructions"].replace(
+    prompt["instructions"] = prompt["instructions"].replace(
+        "{ohmyscrapper_texts}", texts
+    )
 
     # The client gets the API key from the environment variable `GEMINI_API_KEY`.
     client = genai.Client()
-    response = client.models.generate_content(
+    response = client.models.generate_content(
+        model=prompt["model"], contents=prompt["instructions"]
+    )
     response = str(response.text)
-    urls_manager.add_ai_log(
+    urls_manager.add_ai_log(
+        instructions=prompt["instructions"],
+        response=response,
+        model=prompt["model"],
+        prompt_name=prompt["name"],
+        prompt_file=prompt["prompt_file"],
+    )
     print(response)
     print("^^^^^^")
     process_ai_response(response=response)
@@ -114,7 +134,9 @@ def process_with_ai(recursive=True, triggered_times=0):
     if triggered_times > 5:
         print("!!! This is a break to prevent budget accident$.")
         print("You triggered", triggered_times, "times the AI processing function.")
-        print(
+        print(
+            "If you are sure this is correct, you can re-call this function again."
+        )
         print("Please, check it.")
         return
 
@@ -122,6 +144,7 @@ def process_with_ai(recursive=True, triggered_times=0):
 
     return
 
+
 def _get_prompt():
     prompts_path = "prompts"
     default_prompt = """---
@@ -135,13 +158,17 @@ Process with AI this prompt: {ohmyscrapper_texts}
         os.mkdir(prompts_path)
 
         open(f"{prompts_path}/prompt.md", "w").write(default_prompt)
-        print(
+        print(
+            f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there."
+        )
        return False
 
    prompt_files = os.listdir(prompts_path)
    if len(prompt_files) == 0:
        open(f"{prompts_path}/prompt.md", "w").write(default_prompt)
-        print(
+        print(
+            f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there."
+        )
        return False
    prompt = {}
    if len(prompt_files) == 1:
@@ -151,8 +178,10 @@
         prompts = {}
         for index, file in enumerate(prompt_files):
             prompts[index] = _parse_prompt(prompts_path=prompts_path, prompt_file=file)
-            print(index, ":", prompts[index][
-        input_prompt = input(
+            print(index, ":", prompts[index]["name"])
+        input_prompt = input(
+            "Type the number of the prompt you want to use or 'q' to quit: "
+        )
         if input_prompt == "q":
             return False
         try:
@@ -162,6 +191,7 @@
             prompt = _get_prompt()
         return prompt
 
+
 def _parse_prompt(prompts_path, prompt_file):
     prompt = {}
     raw_prompt = open(f"{prompts_path}/{prompt_file}", "r").read().split("---")
@@ -170,6 +200,8 @@ def _parse_prompt(prompts_path, prompt_file):
     prompt["prompt_file"] = prompt_file
 
     return prompt
+
+
 # TODO: Separate gemini from basic function
 def _process_with_gemini(model, instructions):
     response = """"""

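`xml2dict` leans on BeautifulSoup (beautifulsoup4 is already a dependency) to turn the model's XML-shaped answer into `{container: [child_dict, ...]}`: top-level tags, whose parent is the `[document]` root, become keys, and each direct child gets its own children flattened tag by tag. A self-contained sketch with a hypothetical response shape; the real tag names come from the user's prompt file and the module's parser choice isn't visible in this hunk, so treat both as placeholders:

```python
from bs4 import BeautifulSoup

# Hypothetical response: one container tag with flat item children.
response = """
<jobs>
  <job><id>7</id><url>https://example.com/a</url><title>Data Engineer</title></job>
  <job><id>9</id><url>https://example.com/b</url><title>Backend Dev</title></job>
</jobs>
"""

soup = BeautifulSoup(response, "html.parser")
children_items_dict = {}
for item in soup.find_all():
    if item.parent.name == "[document]":
        children_items_dict[item.name] = []  # top-level container tag
    elif item.parent.name in children_items_dict:
        # direct child of a container: flatten its own children into a dict
        children_items_dict[item.parent.name].append(
            {child.name: child.text for child in item.find_all()}
        )

print(children_items_dict)
# {'jobs': [{'id': '7', 'url': 'https://example.com/a', 'title': 'Data Engineer'},
#           {'id': '9', 'url': 'https://example.com/b', 'title': 'Backend Dev'}]}
```
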
{ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/scrap_urls.py
@@ -7,72 +7,87 @@ import time
 import random
 
 
-def process_linkedin_redirect(url_report, url):
-
+def process_linkedin_redirect(url_report, url, verbose=False):
+    if verbose:
+        print("linkedin_redirect")
 
     if url_report["total-a-links"] < 5:
         if "first-a-link" in url_report.keys():
             url_destiny = url_report["first-a-link"]
         else:
             urls_manager.set_url_error(url=url["url"], value="error: no first-a-link")
-
+            if verbose:
+                print("no url for:", url["url"])
             return
     else:
         if "og:url" in url_report.keys():
             url_destiny = url_report["og:url"]
         else:
             urls_manager.set_url_error(url=url["url"], value="error: no og:url")
-
+            if verbose:
+                print("no url for:", url["url"])
             return
-
-
+    if verbose:
+        print(url["url"], ">>", url_destiny)
     urls_manager.add_url(url=url_destiny)
     urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)
 
 
-def process_linkedin_feed(url_report, url):
-
+def process_linkedin_feed(url_report, url, verbose=False):
+    if verbose:
+        print("linkedin_feed")
 
     if "og:url" in url_report.keys():
         url_destiny = url_report["og:url"]
     else:
         urls_manager.set_url_error(url=url["url"], value="error: no og:url")
-
+        if verbose:
+            print("no url for:", url["url"])
         return
 
-
+    if verbose:
+        print(url["url"], ">>", url_destiny)
     urls_manager.add_url(url=url_destiny)
     urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)
 
 
-def process_linkedin_job(url_report, url):
-
+def process_linkedin_job(url_report, url, verbose=False):
+    if verbose:
+        print("linkedin_job")
     changed = False
     if "h1" in url_report.keys():
-
+        if verbose:
+            print(url["url"], ": ", url_report["h1"])
         urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
         changed = True
     elif "og:title" in url_report.keys():
-
+        if verbose:
+            print(url["url"], ": ", url_report["og:title"])
         urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
         changed = True
 
     if "description" in url_report.keys():
-        urls_manager.set_url_description(
+        urls_manager.set_url_description(
+            url=url["url"], value=url_report["description"]
+        )
         changed = True
     elif "og:description" in url_report.keys():
-        urls_manager.set_url_description(
+        urls_manager.set_url_description(
+            url=url["url"], value=url_report["og:description"]
+        )
         changed = True
     if not changed:
         urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")
 
 
-def process_linkedin_post(url_report, url):
-
-
+def process_linkedin_post(url_report, url, verbose=False):
+    if verbose:
+        print("linkedin_post or generic")
+        print(url["url"])
     changed = False
     if "h1" in url_report.keys():
-
+        if verbose:
+            print(url["url"], ": ", url_report["h1"])
         urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
         changed = True
     elif "og:title" in url_report.keys():
@@ -88,52 +103,50 @@ def process_linkedin_post(url_report, url):
 
     if description is not None:
         urls_manager.set_url_description(url=url["url"], value=description)
-        description_links = load_txt.put_urls_from_string(
+        description_links = load_txt.put_urls_from_string(
+            text_to_process=description, parent_url=url["url"]
+        )
         urls_manager.set_url_description_links(url=url["url"], value=description_links)
 
     if not changed:
         urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")
 
 
-def scrap_url(url):
-    # TODO: Use get_urls_valid_prefix_by_id()
-    df = urls_manager.get_urls_valid_prefix()
-
+def scrap_url(url, verbose=False):
     # TODO: Need to change this
 
     if url["url_type"] is None:
-
+        if verbose:
+            print("\n\ngeneric:", url["url"])
         url["url_type"] = "generic"
     else:
-
+        if verbose:
+            print("\n\n", url["url_type"] + ":", url["url"])
     try:
         url_report = sniff_url.get_tags(url=url["url"])
     except Exception as e:
         urls_manager.set_url_error(url=url["url"], value="error")
         urls_manager.touch_url(url=url["url"])
-
-
-
-
-
-
+        if verbose:
+            print("\n\n!!! ERROR FOR:", url["url"])
+            print(
+                "\n\n!!! you can check the URL using the command sniff-url",
+                url["url"],
+                "\n\n",
+            )
         return
 
-    # linkedin_redirect - linkedin (https://lnkd.in/)
     if url["url_type"] == "linkedin_redirect":
-        process_linkedin_redirect(url_report=url_report, url=url)
+        process_linkedin_redirect(url_report=url_report, url=url, verbose=verbose)
 
-    # linkedin_feed - linkedin (https://%.linkedin.com/feed/)
     if url["url_type"] == "linkedin_feed":
-        process_linkedin_feed(url_report=url_report, url=url)
+        process_linkedin_feed(url_report=url_report, url=url, verbose=verbose)
 
-    # linkedin_job - linkedin (https://www.linkedin.com/jobs/)
     if url["url_type"] == "linkedin_job":
-        process_linkedin_job(url_report=url_report, url=url)
+        process_linkedin_job(url_report=url_report, url=url, verbose=verbose)
 
-    # linkedin_job - linkedin (https://www.linkedin.com/jobs/)
     if url["url_type"] == "linkedin_post" or url["url_type"] == "generic":
-        process_linkedin_post(url_report=url_report, url=url)
+        process_linkedin_post(url_report=url_report, url=url, verbose=verbose)
 
     urls_manager.set_url_json(url=url["url"], value=url_report["json"])
     urls_manager.touch_url(url=url["url"])
@@ -144,35 +157,53 @@ def isNaN(num):
 
 
 def scrap_urls(
-    recursive=False,
+    recursive=False,
+    ignore_valid_prefix=False,
+    randomize=False,
+    only_parents=True,
+    verbose=False,
+    n_urls=0,
 ):
+    limit = 10
     classify_urls.classify_urls()
     urls = urls_manager.get_untouched_urls(
         ignore_valid_prefix=ignore_valid_prefix,
         randomize=randomize,
         only_parents=only_parents,
+        limit=limit,
     )
     if len(urls) == 0:
-        print("no urls to scrap")
+        print("📭 no urls to scrap")
+        if n_urls > 0:
+            print(f"-- 🗃️ {n_urls} scraped urls in total...")
+        print("scrapping is over...")
         return
     for index, url in urls.iterrows():
-        scrap_url(url)
-
-        wait = random.randint(15, 20)
         wait = random.randint(1, 3)
-        print(
+        print(
+            "🐶 Scrapper is sleeping for", wait, "seconds before scraping next url..."
+        )
         time.sleep(wait)
 
+        print("🐕 Scrapper is sniffing the url...")
+        scrap_url(url=url, verbose=verbose)
+
+    n_urls = n_urls + len(urls)
+    print(f"-- 🗃️ {n_urls} scraped urls...")
     classify_urls.classify_urls()
     if recursive:
         wait = random.randint(5, 10)
-        print(
+        print(
+            f"🐶 Scrapper is sleeping for {wait} seconds before next round of {limit} urls"
+        )
         time.sleep(wait)
         scrap_urls(
             recursive=recursive,
             ignore_valid_prefix=ignore_valid_prefix,
             randomize=randomize,
             only_parents=only_parents,
+            verbose=verbose,
+            n_urls=n_urls,
         )
     else:
-        print("
+        print("scrapping is over...")

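`scrap_urls` now pulls batches of `limit = 10` untouched urls, sleeps a random 1 to 3 seconds before each request, carries an `n_urls` counter through the recursive calls, and stops when a batch comes back empty. The sketch below renders that same control flow iteratively with stand-in callables for the database and the scraper; it is an illustration of the loop, not the package's implementation, and it omits the classifier pass the module runs between batches:

```python
import random
import time


def scrap_in_batches(fetch_batch, scrap_one, limit=10, verbose=False):
    """Fetch up to `limit` urls at a time, scrape each after a short random
    pause, and stop once a fetch returns nothing."""
    n_urls = 0
    while True:
        batch = fetch_batch(limit)
        if not batch:
            print("📭 no urls to scrap")
            if n_urls > 0:
                print(f"-- 🗃️ {n_urls} scraped urls in total...")
            print("scrapping is over...")
            return n_urls
        for url in batch:
            wait = random.randint(1, 3)
            if verbose:
                print(f"🐶 sleeping {wait}s before scraping {url}")
            time.sleep(wait)
            scrap_one(url)
        n_urls += len(batch)
        print(f"-- 🗃️ {n_urls} scraped urls...")


# Stand-in queue and no-op scraper, just to exercise the loop:
queue = [f"https://example.com/{i}" for i in range(4)]
fetch = lambda limit: [queue.pop() for _ in range(min(limit, len(queue)))]
scrap_in_batches(fetch, scrap_one=lambda u: None, limit=2)
```
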
{ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/show.py
@@ -1,10 +1,13 @@
 import ohmyscrapper.models.urls_manager as urls_manager
 import math
+import os
 from rich.console import Console
 from rich.table import Table
 
 
 def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):
+    if not os.path.exists("output"):
+        os.mkdir("output")
     df = urls_manager.get_urls(limit=limit)
 
     if simplify:
@@ -12,7 +15,7 @@ def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):
 
     df.to_csv(csv_file, index=False)
     print("--------------------")
-    print("Urls exported to", csv_file)
+    print("📊🖋️ Urls exported to", csv_file)
 
     df.replace(
         {
@@ -22,17 +25,19 @@ def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):
         inplace=True,
     )
     df.to_html(csv_file + "-preview.html", index=False)
-    print("Urls preview exported to", csv_file + "-preview.html")
+    print("📜🖋️ Urls preview exported to", csv_file + "-preview.html")
     print("--------------------")
 
 
 def export_report(csv_file="output/report.csv"):
+    if not os.path.exists("output"):
+        os.mkdir("output")
     df = urls_manager.get_urls_report()
 
     df.to_csv(csv_file, index=False)
     _clear_file(csv_file)
     print("--------------------")
-    print("Urls report exported to", csv_file)
+    print("📊🖋️ Urls report exported to", csv_file)
 
     df.replace(
         {
@@ -44,9 +49,10 @@ def export_report(csv_file="output/report.csv"):
     df.to_html(csv_file + "-preview.html", index=False)
     _clear_file(csv_file + "-preview.html")
 
-    print("Urls report preview exported to", csv_file + "-preview.html")
+    print("📜🖋️ Urls report preview exported to", csv_file + "-preview.html")
     print("--------------------")
 
+
 # TODO: Add transformation layer
 def _clear_file(txt_tile):
     with open(txt_tile, "r") as f:
@@ -56,6 +62,7 @@ def _clear_file(txt_tile):
     with open(txt_tile, "w") as f:
         f.write(content)
 
+
 def show_urls(limit=0, jump_to_page=0):
     df = urls_manager.get_urls(limit=limit)
     df.drop(columns=["json", "description"], inplace=True)

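Both exporters now guard against a missing `/output` folder before `df.to_csv` and `df.to_html` run. If you reuse the pattern elsewhere, `os.makedirs` with `exist_ok=True` is a one-line equivalent of the exists/mkdir pair (and also creates intermediate directories); a minimal sketch with a throwaway frame:

```python
import os

import pandas as pd

os.makedirs("output", exist_ok=True)  # same guard as the exists()/mkdir() pair
pd.DataFrame({"url": ["https://example.com"]}).to_csv("output/urls.csv", index=False)
```
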
ohmyscrapper-0.2.3/src/ohmyscrapper/modules/classify_urls.py
@@ -1,23 +0,0 @@
-import ohmyscrapper.models.urls_manager as urls_manager
-import pandas as pd
-import time
-
-
-def classify_urls(recursive=False):
-    urls_manager.seeds()
-    df = urls_manager.get_urls_valid_prefix()
-
-    keep_alive = True
-    while keep_alive:
-        print("waking up!")
-        for index, row_prefix in df.iterrows():
-            df_urls = urls_manager.get_url_like_unclassified(like_condition=row_prefix["url_prefix"])
-            for index, row_urls in df_urls.iterrows():
-                urls_manager.set_url_type_by_id(url_id =row_urls["id"], url_type=row_prefix["url_type"])
-
-        if not recursive:
-            print("ending...")
-            keep_alive = False
-        else:
-            print("sleeping...")
-            time.sleep(10)

ohmyscrapper-0.2.3/src/ohmyscrapper/modules/load_txt.py
@@ -1,32 +0,0 @@
-import os
-from urlextract import URLExtract
-import ohmyscrapper.models.urls_manager as urls_manager
-
-
-def load_txt(file_name="input/_chat.txt"):
-
-    if not os.path.exists("input"):
-        os.mkdir("input")
-
-    urls_manager.create_tables()
-    urls_manager.seeds()
-    # make it recursive for all files
-    text_file_content = open(file_name, "r").read()
-
-    put_urls_from_string(text_to_process=text_file_content)
-
-    # move_it_to_processed
-    print("--------------------")
-    print(file_name, "processed")
-
-
-def put_urls_from_string(text_to_process, parent_url=None):
-    if isinstance(text_to_process, str):
-        extractor = URLExtract()
-        for url in extractor.find_urls(text_to_process):
-            urls_manager.add_url(url=url, parent_url=parent_url)
-            print(url, "added")
-
-        return len(extractor.find_urls(text_to_process))
-    else:
-        return 0

{ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/__main__.py: File without changes
{ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/merge_dbs.py: File without changes
{ohmyscrapper-0.2.3 → ohmyscrapper-0.3.4}/src/ohmyscrapper/modules/sniff_url.py: File without changes