ohmyscrapper-0.2.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ohmyscrapper/__init__.py +162 -0
- ohmyscrapper/__main__.py +4 -0
- ohmyscrapper/models/urls_manager.py +400 -0
- ohmyscrapper/modules/classify_urls.py +23 -0
- ohmyscrapper/modules/load_txt.py +32 -0
- ohmyscrapper/modules/merge_dbs.py +6 -0
- ohmyscrapper/modules/process_with_ai.py +197 -0
- ohmyscrapper/modules/scrap_urls.py +178 -0
- ohmyscrapper/modules/seed.py +7 -0
- ohmyscrapper/modules/show.py +127 -0
- ohmyscrapper/modules/sniff_url.py +88 -0
- ohmyscrapper/modules/untouch_all.py +7 -0
- ohmyscrapper-0.2.1.dist-info/METADATA +110 -0
- ohmyscrapper-0.2.1.dist-info/RECORD +16 -0
- ohmyscrapper-0.2.1.dist-info/WHEEL +4 -0
- ohmyscrapper-0.2.1.dist-info/entry_points.txt +3 -0
ohmyscrapper/__init__.py
ADDED
@@ -0,0 +1,162 @@
import argparse

from ohmyscrapper.modules.classify_urls import classify_urls
from ohmyscrapper.modules.sniff_url import sniff_url
from ohmyscrapper.modules.load_txt import load_txt
from ohmyscrapper.modules.seed import seed
from ohmyscrapper.modules.scrap_urls import scrap_urls
from ohmyscrapper.modules.show import (
    show_url,
    show_urls,
    show_urls_valid_prefix,
    export_urls,
    export_report,
)
from ohmyscrapper.modules.untouch_all import untouch_all
from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
from ohmyscrapper.modules.merge_dbs import merge_dbs


def main():
    parser = argparse.ArgumentParser(prog="ohmyscrapper")
    parser.add_argument("--version", action="version", version="%(prog)s v0.2.1")

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    ai_process_parser = subparsers.add_parser(
        "process-with-ai", help="Process with AI."
    )
    ai_process_parser.add_argument(
        "--history", default=False, help="Reprocess ai history", action="store_true"
    )

    seed_parser = subparsers.add_parser(
        "seed", help="Seed database. Necessary to classify urls."
    )
    untouch_parser = subparsers.add_parser(
        "untouch-all", help="Untouch all urls. That resets classification"
    )

    classify_urls_parser = subparsers.add_parser(
        "classify-urls", help="Classify loaded urls"
    )
    classify_urls_parser.add_argument(
        "--recursive", default=False, help="Run in recursive mode", action="store_true"
    )

    load_txt_parser = subparsers.add_parser("load", help="Load txt file")
    load_txt_parser.add_argument(
        "-file", default="input/_chat.txt", help="File path. Default is input/_chat.txt"
    )

    scrap_urls_parser = subparsers.add_parser("scrap-urls", help="Scrap urls")
    scrap_urls_parser.add_argument(
        "--recursive", default=False, help="Run in recursive mode", action="store_true"
    )
    scrap_urls_parser.add_argument(
        "--ignore-type", default=False, help="Ignore urls types", action="store_true"
    )
    scrap_urls_parser.add_argument(
        "--randomize", default=False, help="Random order", action="store_true"
    )
    scrap_urls_parser.add_argument(
        "--only-parents", default=False, help="Only parents urls", action="store_true"
    )

    sniff_url_parser = subparsers.add_parser("sniff-url", help="Check url")
    sniff_url_parser.add_argument(
        "url", default="https://cesarcardoso.cc/", help="Url to sniff"
    )

    show_urls_parser = subparsers.add_parser("show", help="Show urls and prefixes")
    show_urls_parser.add_argument(
        "--prefixes", default=False, help="Show urls valid prefix", action="store_true"
    )
    show_urls_parser.add_argument("--limit", default=0, help="Limit of lines to show")
    show_urls_parser.add_argument("-url", default="", help="Url to show")

    export_parser = subparsers.add_parser("export", help="Export urls to csv.")
    export_parser.add_argument("--limit", default=0, help="Limit of lines to export")
    export_parser.add_argument(
        "--file",
        default="output/urls.csv",
        help="File path. Default is output/urls.csv",
    )
    export_parser.add_argument(
        "--simplify",
        default=False,
        help="Ignore json and descriptions",
        action="store_true",
    )

    report_parser = subparsers.add_parser("report", help="Export urls report to csv.")
    merge_parser = subparsers.add_parser("merge_dbs", help="Merge databases.")

    # TODO: What is that?
    # seed_parser.set_defaults(func=seed)
    # classify_urls_parser.set_defaults(func=classify_urls)
    # load_txt_parser.set_defaults(func=load_txt)

    args = parser.parse_args()

    if args.command == "classify-urls":
        classify_urls(args.recursive)
        return

    if args.command == "load":
        load_txt(args.file)
        return

    if args.command == "seed":
        seed()
        return

    if args.command == "untouch-all":
        untouch_all()
        return

    if args.command == "sniff-url":
        sniff_url(args.url)
        return

    if args.command == "scrap-urls":
        scrap_urls(
            recursive=args.recursive,
            ignore_valid_prefix=args.ignore_type,
            randomize=args.randomize,
            only_parents=args.only_parents,
        )
        return

    if args.command == "show":
        if args.prefixes:
            show_urls_valid_prefix(int(args.limit))
            return
        if args.url != "":
            show_url(args.url)
            return
        show_urls(int(args.limit))
        return

    if args.command == "export":
        export_urls(limit=int(args.limit), csv_file=args.file, simplify=args.simplify)
        return

    if args.command == "process-with-ai":
        if args.history:
            reprocess_ai_history()
        else:
            process_with_ai()
        return

    if args.command == "report":
        export_report()
        return

    if args.command == "merge_dbs":
        merge_dbs()
        return


if __name__ == "__main__":
    main()
ohmyscrapper/models/urls_manager.py
ADDED
@@ -0,0 +1,400 @@
import os
import sqlite3
import time
import glob
import pandas as pd
from urllib.parse import urlparse, urlunparse


def get_db_dir():
    if not os.path.exists("db"):
        os.mkdir("db")
    return "db"


def get_db_path():
    return get_db_dir() + "/local.db"


def get_db_connection():
    return sqlite3.connect(get_db_path())


# TODO: check if it makes sense
conn = get_db_connection()


def create_tables():

    c = conn.cursor()
    c.execute(
        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, h1 TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
    )
    c.execute(
        "CREATE TABLE IF NOT EXISTS ai_log (id INTEGER PRIMARY KEY, instructions STRING, response STRING, model STRING, prompt_file STRING, prompt_name STRING, created_at DATETIME)"
    )

    c.execute(
        "CREATE TABLE IF NOT EXISTS urls_valid_prefix (id INTEGER PRIMARY KEY, url_prefix TEXT UNIQUE, url_type TEXT)"
    )

    return pd.read_sql_query("SELECT * FROM urls LIMIT 100", conn)


# TODO: not sure this should be something. depends on the project
def seeds():
    create_tables()

    add_urls_valid_prefix("https://%.linkedin.com/posts/%", "linkedin_post")
    add_urls_valid_prefix("https://lnkd.in/%", "linkedin_redirect")
    add_urls_valid_prefix("https://%.linkedin.com/jobs/view/%", "linkedin_job")
    add_urls_valid_prefix("https://%.linkedin.com/feed/%", "linkedin_feed")
    add_urls_valid_prefix("https://%.linkedin.com/company/%", "linkedin_company")

    # add_urls_valid_prefix("%.pdf", "pdf")
    # add_url('https://imazon.org.br/categorias/artigos-cientificos/')

    return True


def add_urls_valid_prefix(url_prefix, url_type):
    conn = get_db_connection()

    df = pd.read_sql_query(
        f"SELECT * FROM urls_valid_prefix WHERE url_prefix = '{url_prefix}'", conn
    )
    if len(df) == 0:
        c = conn.cursor()
        c.execute(
            "INSERT INTO urls_valid_prefix (url_prefix, url_type) VALUES (?, ?)",
            (url_prefix, url_type),
        )
        conn.commit()


def get_urls_valid_prefix_by_type(url_type):
    df = pd.read_sql_query(
        f"SELECT * FROM urls_valid_prefix WHERE url_type = '{url_type}'", conn
    )
    return df


def get_urls_valid_prefix_by_id(id):
    df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix WHERE id = '{id}'", conn)
    return df


# TODO: pagination required
def get_urls_valid_prefix(limit=0):
    if limit > 0:
        df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix LIMIT {limit}", conn)
    else:
        df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix", conn)
    return df


# TODO: pagination required
def get_urls(limit=0):
    if limit > 0:
        df = pd.read_sql_query(
f"SELECT * FROM urls LIMIT {limit} ORDER BY history ASC", conn
        )
    else:
        df = pd.read_sql_query(f"SELECT * FROM urls ORDER BY history ASC", conn)
    return df


def get_urls_report():
    sql = """
    WITH parent_url AS (
        SELECT parent_url FROM urls WHERE parent_url IS NOT NULL AND parent_url != '' GROUP BY parent_url
    ),
    parents AS (
        SELECT
            u.id,
            u.url,
            u.h1
        FROM urls u
        INNER JOIN parent_url p
            ON u.url = p.parent_url
    )
    SELECT
        u.id,
        u.url_type,
        u.url,
        COALESCE(u.h1, p.h1) as h1,
        p.url as parent_url,
        p.h1 as parent_h1
    FROM urls u
    LEFT JOIN parents p
        ON u.parent_url = p.url
    WHERE
        u.history = 0
        AND u.url NOT IN (SELECT url FROM parents)
    ORDER BY url_type DESC
    """
    df = pd.read_sql_query(sql, conn)

    return df


def get_url_by_url(url):
    url = clean_url(url)
    df = pd.read_sql_query(f"SELECT * FROM urls WHERE url = '{url}'", conn)

    return df


def get_url_by_id(id):
    df = pd.read_sql_query(f"SELECT * FROM urls WHERE id = '{id}'", conn)

    return df


def get_urls_by_url_type(url_type):
    df = pd.read_sql_query(
        f"SELECT * FROM urls WHERE history = 0 AND url_type = '{url_type}'", conn
    )
    return df


def get_urls_by_url_type_for_ai_process(url_type="linkedin_post", limit=10):
    df = pd.read_sql_query(
        f"SELECT * FROM urls WHERE history = 0 AND url_type = '{url_type}' AND ai_processed = 0 LIMIT {limit}",
        conn,
    )
    return df


def get_url_like_unclassified(like_condition):
    df = pd.read_sql_query(
        f"SELECT * FROM urls WHERE history = 0 AND url LIKE '{like_condition}' AND url_type IS NULL",
        conn,
    )
    return df


def add_url(url, h1=None, parent_url=None):
    url = clean_url(url)
    c = conn.cursor()

    if h1 is not None:
        h1 = h1.strip()

    if parent_url is None:
        parent_url = None

    parent_url = str(parent_url)

    if len(get_url_by_url(url)) == 0:
        c.execute(
            "INSERT INTO urls (url, h1, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
            (url, h1, parent_url, int(time.time())),
        )
        conn.commit()

    return get_url_by_url(url)


def add_ai_log(instructions, response, model, prompt_file, prompt_name):
    c = conn.cursor()

    c.execute(
        "INSERT INTO ai_log (instructions, response, model, prompt_file, prompt_name, created_at) VALUES (?, ?, ?, ?, ?, ?)",
        (instructions, response, model, prompt_file, prompt_name, int(time.time())),
    )
    conn.commit()


def get_ai_log():
    df = pd.read_sql_query(f"SELECT * FROM ai_log", conn)
    return df


def set_url_destiny(url, destiny):
    url = clean_url(url)
    destiny = clean_url(destiny)
    c = conn.cursor()
    c.execute("UPDATE urls SET url_destiny = ? WHERE url = ?", (destiny, url))
    c.execute(
        "UPDATE urls SET parent_url = ? WHERE url = ?",
        (str(url), destiny),
    )

    conn.commit()


def set_url_h1(url, value):
    value = str(value).strip()
    url = clean_url(url)
    c = conn.cursor()
    c.execute("UPDATE urls SET h1 = ? WHERE url = ?", (value, url))
    conn.commit()


def set_url_h1_by_id(id, value):
    value = str(value).strip()

    c = conn.cursor()
    c.execute("UPDATE urls SET h1 = ? WHERE id = ?", (value, id))
    conn.commit()


def set_url_ai_processed_by_id(id, json_str):
    value = 1
    value = str(value).strip()
    c = conn.cursor()
    c.execute("UPDATE urls SET ai_processed = ? , json_ai = ? WHERE id = ?", (value, json_str, id))
    conn.commit()


def set_url_empty_ai_processed_by_id(id, json_str="empty result"):
    value = 1
    value = str(value).strip()
    c = conn.cursor()
    c.execute("UPDATE urls SET ai_processed = ? , json_ai = ? WHERE ai_processed = 0 AND id = ?", (value, json_str, id))
    conn.commit()


def set_url_ai_processed_by_url(url, json_str):
    value = 1
    value = str(value).strip()
    url = clean_url(url)
    c = conn.cursor()
    c.execute("UPDATE urls SET ai_processed = ?, json_ai = ? WHERE url = ?", (value, json_str, url))
    conn.commit()


def set_url_description(url, value):
    url = clean_url(url)
    c = conn.cursor()
    c.execute("UPDATE urls SET description = ? WHERE url = ?", (value, url))
    conn.commit()


def set_url_description_links(url, value):
    url = clean_url(url)
    c = conn.cursor()
    c.execute("UPDATE urls SET description_links = ? WHERE url = ?", (value, url))
    conn.commit()


def set_url_json(url, value):
    url = clean_url(url)
    c = conn.cursor()
    c.execute("UPDATE urls SET json = ? WHERE url = ?", (value, url))
    conn.commit()


def set_url_error(url, value):
    url = clean_url(url)
    c = conn.cursor()
    c.execute("UPDATE urls SET error = ? WHERE url = ?", (value, url))
    conn.commit()


def set_url_type_by_id(url_id, url_type):
    c = conn.cursor()
    c.execute(f"UPDATE urls SET url_type = '{url_type}' WHERE id = {url_id}")
    conn.commit()


def clean_url(url):
    if url[0:7] == "http://":
        url = "https://" + url[7:]

    if url[0:8] != "https://":
        url = "https://" + url
    url = url.split("#")[0]
    old_query = urlparse(url).query.split("&")
    new_query = []
    for i in old_query:
        if i[0:4] != "utm_":
            new_query.append(i)

    url = urlunparse(urlparse(url)._replace(query="&".join(new_query))).replace("'", "")
    return url


def get_untouched_urls(
    limit=10, randomize=True, ignore_valid_prefix=False, only_parents=True
):
    where_sql = ""
    if not ignore_valid_prefix:
        where_sql += " AND url_type IS NOT NULL "

    if only_parents:
        where_sql += " AND (parent_url = '' OR parent_url IS NULL) "

    if randomize:
        random_sql = " RANDOM() "
    else:
        random_sql = " created_at DESC "
    sql = f"SELECT * FROM urls WHERE 1 = 1 AND history = 0 {where_sql} AND last_touch IS NULL ORDER BY {random_sql} LIMIT {limit}"
    df = pd.read_sql_query(sql, conn)
    return df


def touch_url(url):
    url = clean_url(url)
    c = conn.cursor()
    c.execute("UPDATE urls SET last_touch = ? WHERE url = ?", (int(time.time()), url))
    conn.commit()


def untouch_url(url):
    url = clean_url(url)
    c = conn.cursor()
c.execute("UPDATE urls SET last_touch = NULL WHERE url = ?", (url))
    conn.commit()


def untouch_all_urls():
    c = conn.cursor()
    c.execute("UPDATE urls SET last_touch = NULL WHERE history = 0")
    conn.commit()


def set_all_urls_as_history():
    c = conn.cursor()
    c.execute("UPDATE urls SET history = 1")
    conn.commit()


def merge_dbs() -> None:
    production_db_file = get_db_path()
    db_number = -1
    dir = get_db_dir()
    list_of_files = glob.glob(dir + "/*.db")
    list_of_files.remove(production_db_file)
    if len(list_of_files) > 0:
        print("\nAvailable dbs:")
        for index, file in enumerate(list_of_files):
            print(index, ":", file)
        while db_number < 0 or db_number >= len(list_of_files):
            db_number = int(input("Choose the db to merge: "))

        print(list_of_files[db_number])
        source_conn = sqlite3.connect(list_of_files[db_number])
        df = pd.read_sql_query("SELECT * FROM urls", source_conn)
        for index, row in df.iterrows():
            merge_url(
                row["url"],
                f"merged from {list_of_files[db_number]}",
                row["last_touch"],
                row["created_at"],
                row["description"],
                row["json"],
            )
        # ßmerge_url(df)


def merge_url(url, h1, last_touch, created_at, description, json):
    url = clean_url(url)
    c = conn.cursor()

    if h1 is not None:
        h1 = h1.strip()

    if len(get_url_by_url(url)) == 0:
        c.execute(
            "INSERT INTO urls (url, h1, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
            (url, h1, last_touch, created_at, description, json),
        )
        conn.commit()
ohmyscrapper/modules/classify_urls.py
ADDED
@@ -0,0 +1,23 @@
import ohmyscrapper.models.urls_manager as urls_manager
import pandas as pd
import time


def classify_urls(recursive=False):
    urls_manager.seeds()
    df = urls_manager.get_urls_valid_prefix()

    keep_alive = True
    while keep_alive:
        print("waking up!")
        for index, row_prefix in df.iterrows():
            df_urls = urls_manager.get_url_like_unclassified(like_condition=row_prefix["url_prefix"])
            for index, row_urls in df_urls.iterrows():
                urls_manager.set_url_type_by_id(url_id=row_urls["id"], url_type=row_prefix["url_type"])

        if not recursive:
            print("ending...")
            keep_alive = False
        else:
            print("sleeping...")
            time.sleep(10)
ohmyscrapper/modules/load_txt.py
ADDED
@@ -0,0 +1,32 @@
import os
from urlextract import URLExtract
import ohmyscrapper.models.urls_manager as urls_manager


def load_txt(file_name="input/_chat.txt"):

    if not os.path.exists("input"):
        os.mkdir("input")

    urls_manager.create_tables()
    urls_manager.seeds()
    # make it recursive for all files
    text_file_content = open(file_name, "r").read()

    put_urls_from_string(text_to_process=text_file_content)

    # move_it_to_processed
    print("--------------------")
    print(file_name, "processed")


def put_urls_from_string(text_to_process, parent_url=None):
    if isinstance(text_to_process, str):
        extractor = URLExtract()
        for url in extractor.find_urls(text_to_process):
            urls_manager.add_url(url=url, parent_url=parent_url)
            print(url, "added")

        return len(extractor.find_urls(text_to_process))
    else:
        return 0
ohmyscrapper/modules/process_with_ai.py
ADDED
@@ -0,0 +1,197 @@
import ohmyscrapper.models.urls_manager as urls_manager
from bs4 import BeautifulSoup
from google import genai
from dotenv import load_dotenv
import random
import time
import os
import yaml
import json
# TODO: !!! REFACTOR !!!
load_dotenv()

def reprocess_ai_history():
    df = urls_manager.get_ai_log().to_dict(orient="records")
    for row in df:
        process_ai_response(row["response"])


def process_ai_response(response):
    job_positions = xml2dict(response)

    for index, xml_item_children in job_positions.items():
        for url_child_xml in xml_item_children:

            url_parent = urls_manager.get_url_by_id(url_child_xml["id"])
            if len(url_parent) > 0:
                url_parent = url_parent.iloc[0]
                h1 = url_child_xml.copy()
                del h1["id"]
                del h1["url"]
                h1 = " - ".join(h1.values())
                if url_parent["description_links"] > 1 and url_child_xml["id"] != "":
                    print("-- child updated -- \n", url_child_xml["url"], ":", h1)
                    urls_manager.set_url_h1(url_child_xml["url"], h1)
                    urls_manager.set_url_ai_processed_by_url(url_child_xml["url"], str(json.dumps(url_child_xml)))
                    if url_parent["url"] != url_child_xml["url"]:
                        urls_manager.set_url_ai_processed_by_url(url_parent["url"], "children-update")
                else:
                    print("-- parent updated -- \n", url_parent["url"], ":", h1)
                    urls_manager.set_url_h1(url_parent["url"], h1)
                    urls_manager.set_url_ai_processed_by_url(url_parent["url"], str(json.dumps(url_child_xml)))


def xml2dict(xml_string):
    soup = BeautifulSoup(xml_string, "html.parser")

    children_items_dict = {}
    for item in soup.find_all():
        if item.parent.name == "[document]":
            children_items_dict[item.name] = []
        elif item.parent.name in children_items_dict:
            children_items_dict[item.parent.name].append(_xml_children_to_dict(item))

    return children_items_dict

def _xml_children_to_dict(xml):
    item_dict = {}
    for item in xml.find_all():
        item_dict[item.name] = item.text
    return item_dict

def process_with_ai(recursive=True, triggered_times=0):
    triggered_times = triggered_times + 1

    prompt = _get_prompt()
    if not prompt:
        return

    url_type = "linkedin_post"
    df = urls_manager.get_urls_by_url_type_for_ai_process(url_type)
    if len(df) == 0:
        print("no urls to process with ai anymore")
        return

    texts = ""
    for index, row in df.iterrows():
        texts = (
            texts
            + f"""
<text>
<id>{str(row['id'])}</id>
{row['description']}
</text>
"""
        )
    if texts == "":
        print("no urls to process")
        return

    print("starting...")
    print("prompt:", prompt["name"])
    print("model:", prompt["model"])
    print("description:", prompt["description"])
    prompt["instructions"] = prompt["instructions"].replace("{ohmyscrapper_texts}", texts)

    # The client gets the API key from the environment variable `GEMINI_API_KEY`.
    client = genai.Client()
    response = client.models.generate_content(model=prompt["model"], contents=prompt["instructions"])
    response = str(response.text)
    urls_manager.add_ai_log(instructions=prompt["instructions"], response=response, model=prompt["model"], prompt_name=prompt["name"], prompt_file=prompt["prompt_file"])
    print(response)
    print("^^^^^^")
    process_ai_response(response=response)
    print("ending...")

    for index, row in df.iterrows():
        urls_manager.set_url_empty_ai_processed_by_id(row["id"])

    if recursive:
        wait = random.randint(1, 3)
        print("sleeping for", wait, "seconds before next round")
        time.sleep(wait)

        if triggered_times > 5:
            print("!!! This is a break to prevent budget accident$.")
            print("You triggered", triggered_times, "times the AI processing function.")
            print("If you are sure this is correct, you can re-call this function again.")
            print("Please, check it.")
            return

        process_with_ai(recursive=recursive, triggered_times=triggered_times)

    return

def _get_prompt():
    prompts_path = "prompts"
    default_prompt = """---
model: "gemini-2.5-flash"
name: "default-prompt"
description: "Put here your prompt description."
---
Process with AI this prompt: {ohmyscrapper_texts}
"""
    if not os.path.exists(prompts_path):
        os.mkdir(prompts_path)

        open(f"{prompts_path}/prompt.md", "w").write(default_prompt)
        print(f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there.")
        return False

    prompt_files = os.listdir(prompts_path)
    if len(prompt_files) == 0:
        open(f"{prompts_path}/prompt.md", "w").write(default_prompt)
        print(f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there.")
        return False
    prompt = {}
    if len(prompt_files) == 1:
        prompt = _parse_prompt(prompts_path=prompts_path, prompt_file=prompt_files[0])
    else:
        print("Choose a prompt:")
        prompts = {}
        for index, file in enumerate(prompt_files):
            prompts[index] = _parse_prompt(prompts_path=prompts_path, prompt_file=file)
            print(index, ":", prompts[index]['name'])
        input_prompt = input("Type the number of the prompt you want to use or 'q' to quit: ")
        if input_prompt == "q":
            return False
        try:
            prompt = prompts[int(input_prompt)]
        except:
            print("! Invalid prompt\n")
            prompt = _get_prompt()
    return prompt

def _parse_prompt(prompts_path, prompt_file):
    prompt = {}
    raw_prompt = open(f"{prompts_path}/{prompt_file}", "r").read().split("---")
    prompt = yaml.safe_load(raw_prompt[1])
    prompt["instructions"] = raw_prompt[2].strip()
    prompt["prompt_file"] = prompt_file

    return prompt

# TODO: Separate gemini from basic function
def _process_with_gemini(model, instructions):
    response = """"""
    return response


def _process_with_openai(model, instructions):
    # import os
    # from openai import OpenAI

    # client = OpenAI(
    #     # This is the default and can be omitted
    #     api_key=os.environ.get("OPENAI_API_KEY"),
    # )

    # response = client.responses.create(
    #     model="gpt-4o",
    #     instructions="You are a coding assistant that talks like a pirate.",
    #     input="How do I check if a Python object is an instance of a class?",
    # )

    # print(response.output_text)

    response = """"""
    return response
ohmyscrapper/modules/scrap_urls.py
ADDED
@@ -0,0 +1,178 @@
import ohmyscrapper.models.urls_manager as urls_manager
import ohmyscrapper.modules.sniff_url as sniff_url
import ohmyscrapper.modules.load_txt as load_txt
import ohmyscrapper.modules.classify_urls as classify_urls

import time
import random


def process_linkedin_redirect(url_report, url):
    print("linkedin_redirect")

    if url_report["total-a-links"] < 5:
        if "first-a-link" in url_report.keys():
            url_destiny = url_report["first-a-link"]
        else:
            urls_manager.set_url_error(url=url["url"], value="error: no first-a-link")
            print("no url for:", url["url"])
            return
    else:
        if "og:url" in url_report.keys():
            url_destiny = url_report["og:url"]
        else:
            urls_manager.set_url_error(url=url["url"], value="error: no og:url")
            print("no url for:", url["url"])
            return

    print(url["url"], ">>", url_destiny)
    urls_manager.add_url(url=url_destiny)
    urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)


def process_linkedin_feed(url_report, url):
    print("linkedin_feed")

    if "og:url" in url_report.keys():
        url_destiny = url_report["og:url"]
    else:
        urls_manager.set_url_error(url=url["url"], value="error: no og:url")
        print("no url for:", url["url"])
        return

    print(url["url"], ">>", url_destiny)
    urls_manager.add_url(url=url_destiny)
    urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)


def process_linkedin_job(url_report, url):
    print("linkedin_job")
    changed = False
    if "h1" in url_report.keys():
        print(url["url"], ": ", url_report["h1"])
        urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
        changed = True
    elif "og:title" in url_report.keys():
        print(url["url"], ": ", url_report["og:title"])
        urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
        changed = True

    if "description" in url_report.keys():
        urls_manager.set_url_description(url=url["url"], value=url_report["description"])
        changed = True
    elif "og:description" in url_report.keys():
        urls_manager.set_url_description(url=url["url"], value=url_report["og:description"])
        changed = True
    if not changed:
        urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")


def process_linkedin_post(url_report, url):
    print("linkedin_post or generic")
    print(url["url"])
    changed = False
    if "h1" in url_report.keys():
        print(url["url"], ": ", url_report["h1"])
        urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
        changed = True
    elif "og:title" in url_report.keys():
        urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
        changed = True
    description = None
    if "description" in url_report.keys():
        description = url_report["description"]
        changed = True
    elif "og:description" in url_report.keys():
        description = url_report["og:description"]
        changed = True

    if description is not None:
        urls_manager.set_url_description(url=url["url"], value=description)
        description_links = load_txt.put_urls_from_string(text_to_process=description, parent_url=url["url"])
        urls_manager.set_url_description_links(url=url["url"], value=description_links)

    if not changed:
        urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")


def scrap_url(url):
    # TODO: Use get_urls_valid_prefix_by_id()
    df = urls_manager.get_urls_valid_prefix()

    # TODO: Need to change this

    if url["url_type"] is None:
        print("\n\ngeneric:", url["url"])
        url["url_type"] = "generic"
    else:
        print("\n\n", url["url_type"] + ":", url["url"])
    try:
        url_report = sniff_url.get_tags(url=url["url"])
    except Exception as e:
        urls_manager.set_url_error(url=url["url"], value="error")
        urls_manager.touch_url(url=url["url"])
        print("\n\n!!! ERROR FOR:", url["url"])
        print(
            "\n\n!!! you can check the URL using the command sniff-url",
            url["url"],
            "\n\n",
        )
        return

    # linkedin_redirect - linkedin (https://lnkd.in/)
    if url["url_type"] == "linkedin_redirect":
        process_linkedin_redirect(url_report=url_report, url=url)

    # linkedin_feed - linkedin (https://%.linkedin.com/feed/)
    if url["url_type"] == "linkedin_feed":
        process_linkedin_feed(url_report=url_report, url=url)

    # linkedin_job - linkedin (https://www.linkedin.com/jobs/)
    if url["url_type"] == "linkedin_job":
        process_linkedin_job(url_report=url_report, url=url)

    # linkedin_post - linkedin (https://www.linkedin.com/posts/) or generic
    if url["url_type"] == "linkedin_post" or url["url_type"] == "generic":
        process_linkedin_post(url_report=url_report, url=url)

    urls_manager.set_url_json(url=url["url"], value=url_report["json"])
    urls_manager.touch_url(url=url["url"])


def isNaN(num):
    return num != num


def scrap_urls(
    recursive=False, ignore_valid_prefix=False, randomize=False, only_parents=True
):
    classify_urls.classify_urls()
    urls = urls_manager.get_untouched_urls(
        ignore_valid_prefix=ignore_valid_prefix,
        randomize=randomize,
        only_parents=only_parents,
    )
    if len(urls) == 0:
        print("no urls to scrap")
        return
    for index, url in urls.iterrows():
        scrap_url(url)

        wait = random.randint(15, 20)
        wait = random.randint(1, 3)
        print("sleeping for", wait, "seconds")
        time.sleep(wait)

    classify_urls.classify_urls()
    if recursive:
        wait = random.randint(5, 10)
        print("sleeping for", wait, "seconds before next round")
        time.sleep(wait)
        scrap_urls(
            recursive=recursive,
            ignore_valid_prefix=ignore_valid_prefix,
            randomize=randomize,
            only_parents=only_parents,
        )
    else:
        print("ending...")
ohmyscrapper/modules/show.py
ADDED
@@ -0,0 +1,127 @@
import ohmyscrapper.models.urls_manager as urls_manager
import math
from rich.console import Console
from rich.table import Table


def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):
    df = urls_manager.get_urls(limit=limit)

    if simplify:
        df.drop(columns=["description", "json"], inplace=True)

    df.to_csv(csv_file, index=False)
    print("--------------------")
    print("Urls exported to", csv_file)

    df.replace(
        {
            "description": {r"\n": " "},
        },
        regex=True,
        inplace=True,
    )
    df.to_html(csv_file + "-preview.html", index=False)
    print("Urls preview exported to", csv_file + "-preview.html")
    print("--------------------")


def export_report(csv_file="output/report.csv"):
    df = urls_manager.get_urls_report()

    df.to_csv(csv_file, index=False)
    _clear_file(csv_file)
    print("--------------------")
    print("Urls report exported to", csv_file)

    df.replace(
        {
            "description": {r"\n": " "},
        },
        regex=True,
        inplace=True,
    )
    df.to_html(csv_file + "-preview.html", index=False)
    _clear_file(csv_file + "-preview.html")

    print("Urls report preview exported to", csv_file + "-preview.html")
    print("--------------------")

# TODO: Add transformation layer
def _clear_file(txt_tile):
    with open(txt_tile, "r") as f:
        content = f.read()
    content = content.replace(" - -", " -")
    content = content.replace(" -<", "<")
    with open(txt_tile, "w") as f:
        f.write(content)

def show_urls(limit=0, jump_to_page=0):
    df = urls_manager.get_urls(limit=limit)
    df.drop(columns=["json", "description"], inplace=True)
    # df = df.head(n=20)

    # https://medium.com/@inzaniak/create-tables-in-your-terminal-with-python-6747d68d71a6

    total_items = len(df)
    items_per_page = 15
    n_pages = math.ceil(total_items / items_per_page)

    last_popped = 0
    for page in range(n_pages):

        df_page = df.head(n=items_per_page)
        df_t = df.T
        for i in range(items_per_page):
            if last_popped < total_items:
                df_t.pop(last_popped)
                last_popped += 1
        df = df_t.T
        if page < jump_to_page:
            continue
        show_table(df_page)

        print("Page", page + 1, "of", n_pages)
        user_input = input("Press enter to continue or type q to quit: ")
        if user_input == "q":
            break
        if user_input.isnumeric():
            jump_to_page = math.ceil(int(user_input))
            if jump_to_page > n_pages or jump_to_page < 1:
                print("This page does not exist")
                jump_to_page = 0
            else:
                jump_to_page = jump_to_page - 1
            if page < jump_to_page:
                continue
            elif jump_to_page >= 0:
                show_urls(limit=limit, jump_to_page=jump_to_page)
                break

    return

    return


# TODO: Change place
def show_table(df):
    columns = df.columns.tolist()
    df = df.to_dict(orient="records")
    table = Table(show_header=True, header_style="bold magenta")
    for column in columns:
        table.add_column(column)

    for row in df:
        table.add_row(*[str(value) for value in row.values()])
    console = Console()
    console.print(table)


def show_urls_valid_prefix(limit=0):
    print(urls_manager.get_urls_valid_prefix(limit=limit))
    return


def show_url(url):
    print(urls_manager.get_url_by_url(url=url).T)
    return
ohmyscrapper/modules/sniff_url.py
ADDED
@@ -0,0 +1,88 @@
import requests
from bs4 import BeautifulSoup
import json


def sniff_url(url="https://www.linkedin.com/in/cesardesouzacardoso/", silent=False):
    if not silent:
        print("checking url:", url)
    report_meta_tags = []
    tags_to_search = [
        "description",
        "og:url",
        "og:title",
        "og:description",
        "og:type",
        "lnkd:url",
    ]

    r = requests.get(url=url)
    soup = BeautifulSoup(r.text, "html.parser")

    if not silent:
        print("\n\n\n\n---- all <meta> tags ---\n")
    i = 0
    for meta_tag in soup.find_all("meta"):
        if (
            meta_tag.get("name") in tags_to_search
            or meta_tag.get("property") in tags_to_search
        ):
            report_meta_tags.append(meta_tag)
        i = i + 1
        if not silent:
            print("-- meta tag", i, "--")
            print("name:", meta_tag.get("name"))
            print("property:", meta_tag.get("property"))
            print("content:", meta_tag.get("content"))
            print("---------------- \n")

    if not silent:
        print("\n\n\n\n---- all <a> links ---")
        i = 0
        for a_tag in soup.find_all("a"):
            i = i + 1
            print("\n-- a link", i, "-- ")
            print("target:", a_tag.get("target"))
            print("text:", a_tag.text)
            print("href:", a_tag.get("href"))
            print("-------------- ")

    final_report = {}
    final_report["scrapped-url"] = url
    if len(soup.find_all("h1")) > 0:
        final_report["h1"] = soup.find("h1").text

    for report_meta_tag in report_meta_tags:
        if report_meta_tag.get("name") is not None:
            final_report[report_meta_tag.get("name")] = report_meta_tag.get("content")
        elif report_meta_tag.get("property") is not None:
            final_report[report_meta_tag.get("property")] = report_meta_tag.get(
                "content"
            )

    if len(soup.find_all("a")) > 0:
        final_report["first-a-link"] = soup.find("a").get("href")
        final_report["total-a-links"] = len(soup.find_all("a"))
    else:
        final_report["first-a-link"] = ""
        final_report["total-a-links"] = 0

    if len(soup.find_all("h2")) > 0:
        final_report["h2"] = soup.find("h2").text

    if len(soup.find_all("meta")) > 0:
        final_report["total-meta-tags"] = len(soup.find_all("meta"))
    else:
        final_report["total-meta-tags"] = 0

    final_report["json"] = json.dumps(final_report)
    if not silent:
        print("\n\n\n----report---\n")
        for key in final_report:
            print("* ", key, ":", final_report[key])

    return final_report


def get_tags(url):
    return sniff_url(url=url, silent=True)
ohmyscrapper-0.2.1.dist-info/METADATA
ADDED
@@ -0,0 +1,110 @@
Metadata-Version: 2.3
Name: ohmyscrapper
Version: 0.2.1
Summary: This project aims to create a text-based scraper containing links to create a final PDF with general information about job openings.
Author: Cesar Cardoso gh@bouli
Author-email: Cesar Cardoso gh@bouli <hello@cesarcardoso.cc>
Requires-Dist: beautifulsoup4>=4.14.3
Requires-Dist: google-genai>=1.55.0
Requires-Dist: markdown>=3.10
Requires-Dist: pandas>=2.3.3
Requires-Dist: python-dotenv>=1.2.1
Requires-Dist: pyyaml>=6.0.3
Requires-Dist: requests>=2.32.5
Requires-Dist: rich>=14.2.0
Requires-Dist: urlextract>=1.9.0
Requires-Python: >=3.11
Description-Content-Type: text/markdown

# OhMyScrapper - v0.2.1

This project aims to create a text-based scraper containing links to create a
final PDF with general information about job openings.

> This project is using [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer) by default.

## Scope

- Read texts;
- Extract links;
- Use meta og:tags to extract information;

## Installation

I recommend using [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer), so you can just run the command below and everything is installed:
```shell
uv sync
```

## How to use and test (development only)

OhMyScrapper works in 3 stages (a command sketch follows the list):

1. It collects and loads urls from a text file (by default `input/_chat.txt`) into a database;
2. It scrapes/accesses the collected urls and reads what is relevant. If it finds new urls, they are collected as well;
3. It exports a list of urls to CSV files;
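
If you prefer to drive the stages yourself, a rough equivalent using the CLI defined in `ohmyscrapper/__init__.py` (same `uv run main.py` style as the examples below) is:
```shell
uv run main.py load          # stage 1: load urls from input/_chat.txt
uv run main.py scrap-urls    # stage 2: scrape the collected urls
uv run main.py export        # stage 3: export urls to output/urls.csv
uv run main.py report        # stage 3: export the report to output/report.csv
```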

You can run all 3 stages with the command:
```shell
make start
```
> Remember to add your text file in the folder `/input` with the name `_chat.txt`!

You will find the exported files in the folder `/output` like this:
- `/output/report.csv`
- `/output/report.csv-preview.html`
- `/output/urls-simplified.csv`
- `/output/urls-simplified.csv-preview.html`
- `/output/urls.csv`
- `/output/urls.csv-preview.html`

### BUT: if you want to do it step by step, here it is:

First we load a text file in which you would like to look for urls; the idea here is to
use a WhatsApp chat history, but it works with any txt file.

The default file is `input/_chat.txt`. If you have the default file you just use
the command `load`:
```shell
make load
```
or, if you have another file, just use the argument `-file` like this:
```shell
uv run main.py load -file=my-text-file.txt
```
That will create a database if it doesn't exist and store every url oh-my-scrapper
finds. After that, let's scrape the urls with the command `scrap-urls`:

```shell
make scrap-urls
```

That will scrape only the linkedin urls we are interested in (you can list them with the command shown after this list). For now they are:
- linkedin_post: https://%.linkedin.com/posts/%
- linkedin_redirect: https://lnkd.in/%
- linkedin_job: https://%.linkedin.com/jobs/view/%
- linkedin_feed: https://%.linkedin.com/feed/%
- linkedin_company: https://%.linkedin.com/company/%
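
These prefixes are seeded into the local database by `seeds()` in `urls_manager.py`; you can check what is currently registered with:
```shell
uv run main.py show --prefixes
```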

But we can scrape every other url generically using the argument `--ignore-type`:
```shell
uv run main.py scrap-urls --ignore-type
```

And we can ask it to run recursively by adding the argument `--recursive`:
```shell
uv run main.py scrap-urls --recursive
```
> !!! important: we are not sure whether an excess of requests can get you blocked

And we can finally export with the command:
```shell
make export
```
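
The `export` subcommand also takes a few optional flags (see the argparse setup in `ohmyscrapper/__init__.py`); for example, a sketch that produces the simplified CSV listed above:
```shell
uv run main.py export --simplify --file=output/urls-simplified.csv
uv run main.py report   # writes output/report.csv plus an HTML preview
```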


That's the basic usage!
But you can learn more using the help:
```shell
uv run main.py --help
```
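
Each subcommand also accepts `--help` for its own flags, for example:
```shell
uv run main.py scrap-urls --help
```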
ohmyscrapper-0.2.1.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
ohmyscrapper/__init__.py,sha256=OOoRFtkBKaTIf74FStI0MGtk-LUQOuN0QnBZRfRWauA,5145
ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
ohmyscrapper/models/urls_manager.py,sha256=xKql_xdwfRwgpMyriuIrZ0Srz4gYQGMfWClEWpGRtNE,11183
ohmyscrapper/modules/classify_urls.py,sha256=eyHtTHDZp2pGmYw_X-7LrbeVOgDPcRQdhu0oEuwQtKA,743
ohmyscrapper/modules/load_txt.py,sha256=mL60OGsh-R80P88vxyqvfBEFag9yhSFFbg5pwtu1f90,889
ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
ohmyscrapper/modules/process_with_ai.py,sha256=TpumucIVNZulKOw2idy4hD3vG5IhG5pbhyJImYFP8g0,6844
ohmyscrapper/modules/scrap_urls.py,sha256=KQVs3R03X80hmvvJAU1SqnNhwXEeVV99WlN8TxSKqA8,6097
ohmyscrapper/modules/seed.py,sha256=KeTSbmTdNTkVCtzk9iQmeuEqB0kG-rTZJb2a1WdROL4,129
ohmyscrapper/modules/show.py,sha256=u0L9uxgU8Xt_-myA3r7byuOmnX_-2gkpTtXWkXon1ns,3572
ohmyscrapper/modules/sniff_url.py,sha256=jQDc7aSimuOOedw2fSXZlf6_o0OqQHOr6NsWb4n0XgI,2720
ohmyscrapper/modules/untouch_all.py,sha256=E1U9e3sOG7suzc8ZTWcYiQQo9mPmLJ0piXdXUjFLEd4,162
ohmyscrapper-0.2.1.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
ohmyscrapper-0.2.1.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
ohmyscrapper-0.2.1.dist-info/METADATA,sha256=Sl1HuVlxTSSAYz9ga0zJ9xUpWGY2NZOkNu1xTNtGUu8,3411
ohmyscrapper-0.2.1.dist-info/RECORD,,