ohmyscrapper-0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
+++ ohmyscrapper/__init__.py
@@ -0,0 +1,162 @@
1
+ import argparse
2
+
3
+ from ohmyscrapper.modules.classify_urls import classify_urls
4
+ from ohmyscrapper.modules.sniff_url import sniff_url
5
+ from ohmyscrapper.modules.load_txt import load_txt
6
+ from ohmyscrapper.modules.seed import seed
7
+ from ohmyscrapper.modules.scrap_urls import scrap_urls
8
+ from ohmyscrapper.modules.show import (
9
+ show_url,
10
+ show_urls,
11
+ show_urls_valid_prefix,
12
+ export_urls,
13
+ export_report,
14
+ )
15
+ from ohmyscrapper.modules.untouch_all import untouch_all
16
+ from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
17
+ from ohmyscrapper.modules.merge_dbs import merge_dbs
18
+
19
+
20
+ def main():
21
+ parser = argparse.ArgumentParser(prog="ohmyscrapper")
22
+ parser.add_argument("--version", action="version", version="%(prog)s v0.2.1")
23
+
24
+ subparsers = parser.add_subparsers(dest="command", help="Available commands")
25
+
26
+ ai_process_parser = subparsers.add_parser(
27
+ "process-with-ai", help="Process with AI."
28
+ )
29
+ ai_process_parser.add_argument(
30
+ "--history", default=False, help="Reprocess ai history", action="store_true"
31
+ )
32
+
33
+ seed_parser = subparsers.add_parser(
34
+ "seed", help="Seed database. Necessary to classify urls."
35
+ )
36
+ untouch_parser = subparsers.add_parser(
37
+ "untouch-all", help="Untouch all urls. That resets classification"
38
+ )
39
+
40
+ classify_urls_parser = subparsers.add_parser(
41
+ "classify-urls", help="Classify loaded urls"
42
+ )
43
+ classify_urls_parser.add_argument(
44
+ "--recursive", default=False, help="Run in recursive mode", action="store_true"
45
+ )
46
+
47
+ load_txt_parser = subparsers.add_parser("load", help="Load txt file")
48
+ load_txt_parser.add_argument(
49
+ "-file", default="input/_chat.txt", help="File path. Default is input/_chat.txt"
50
+ )
51
+
52
+ scrap_urls_parser = subparsers.add_parser("scrap-urls", help="Scrap urls")
53
+ scrap_urls_parser.add_argument(
54
+ "--recursive", default=False, help="Run in recursive mode", action="store_true"
55
+ )
56
+ scrap_urls_parser.add_argument(
57
+ "--ignore-type", default=False, help="Ignore urls types", action="store_true"
58
+ )
59
+ scrap_urls_parser.add_argument(
60
+ "--randomize", default=False, help="Random order", action="store_true"
61
+ )
62
+ scrap_urls_parser.add_argument(
63
+ "--only-parents", default=False, help="Only parents urls", action="store_true"
64
+ )
65
+
66
+ sniff_url_parser = subparsers.add_parser("sniff-url", help="Check url")
67
+ sniff_url_parser.add_argument(
68
+ "url", default="https://cesarcardoso.cc/", help="Url to sniff"
69
+ )
70
+
71
+ show_urls_parser = subparsers.add_parser("show", help="Show urls and prefixes")
72
+ show_urls_parser.add_argument(
73
+ "--prefixes", default=False, help="Show urls valid prefix", action="store_true"
74
+ )
75
+ show_urls_parser.add_argument("--limit", default=0, help="Limit of lines to show")
76
+ show_urls_parser.add_argument("-url", default="", help="Url to show")
77
+
78
+ export_parser = subparsers.add_parser("export", help="Export urls to csv.")
79
+ export_parser.add_argument("--limit", default=0, help="Limit of lines to export")
80
+ export_parser.add_argument(
81
+ "--file",
82
+ default="output/urls.csv",
83
+ help="File path. Default is output/urls.csv",
84
+ )
85
+ export_parser.add_argument(
86
+ "--simplify",
87
+ default=False,
88
+ help="Ignore json and descriptions",
89
+ action="store_true",
90
+ )
91
+
92
+ report_parser = subparsers.add_parser("report", help="Export urls report to csv.")
93
+ merge_parser = subparsers.add_parser("merge_dbs", help="Merge databases.")
94
+
95
+ # TODO: What is that?
96
+ # seed_parser.set_defaults(func=seed)
97
+ # classify_urls_parser.set_defaults(func=classify_urls)
98
+ # load_txt_parser.set_defaults(func=load_txt)
99
+
100
+ args = parser.parse_args()
101
+
102
+ if args.command == "classify-urls":
103
+ classify_urls(args.recursive)
104
+ return
105
+
106
+ if args.command == "load":
107
+ load_txt(args.file)
108
+ return
109
+
110
+ if args.command == "seed":
111
+ seed()
112
+ return
113
+
114
+ if args.command == "untouch-all":
115
+ untouch_all()
116
+ return
117
+
118
+ if args.command == "sniff-url":
119
+ sniff_url(args.url)
120
+ return
121
+
122
+ if args.command == "scrap-urls":
123
+ scrap_urls(
124
+ recursive=args.recursive,
125
+ ignore_valid_prefix=args.ignore_type,
126
+ randomize=args.randomize,
127
+ only_parents=args.only_parents,
128
+ )
129
+ return
130
+
131
+ if args.command == "show":
132
+ if args.prefixes:
133
+ show_urls_valid_prefix(int(args.limit))
134
+ return
135
+ if args.url != "":
136
+ show_url(args.url)
137
+ return
138
+ show_urls(int(args.limit))
139
+ return
140
+
141
+ if args.command == "export":
142
+ export_urls(limit=int(args.limit), csv_file=args.file, simplify=args.simplify)
143
+ return
144
+
145
+ if args.command == "process-with-ai":
146
+ if args.history:
147
+ reprocess_ai_history()
148
+ else:
149
+ process_with_ai()
150
+ return
151
+
152
+ if args.command == "report":
153
+ export_report()
154
+ return
155
+
156
+ if args.command == "merge_dbs":
157
+ merge_dbs()
158
+ return
159
+
160
+
161
+ if __name__ == "__main__":
162
+ main()
+++ ohmyscrapper/__main__.py
@@ -0,0 +1,4 @@
1
+ from . import main
2
+
3
+ if __name__ == "__main__":
4
+ main()
+++ ohmyscrapper/models/urls_manager.py
@@ -0,0 +1,400 @@
1
+ import os
2
+ import sqlite3
3
+ import time
4
+ import glob
5
+ import pandas as pd
6
+ from urllib.parse import urlparse, urlunparse
7
+
8
+
9
+ def get_db_dir():
10
+ if not os.path.exists("db"):
11
+ os.mkdir("db")
12
+ return "db"
13
+
14
+
15
+ def get_db_path():
16
+ return get_db_dir() + "/local.db"
17
+
18
+
19
+ def get_db_connection():
20
+ return sqlite3.connect(get_db_path())
21
+
22
+
23
+ # TODO: check if it makes sense
24
+ conn = get_db_connection()
25
+
26
+
27
+ def create_tables():
28
+
29
+ c = conn.cursor()
30
+ c.execute(
31
+ "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, h1 TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
32
+ )
33
+ c.execute(
34
+ "CREATE TABLE IF NOT EXISTS ai_log (id INTEGER PRIMARY KEY, instructions STRING, response STRING, model STRING, prompt_file STRING, prompt_name STRING, created_at DATETIME)"
35
+ )
36
+
37
+ c.execute(
38
+ "CREATE TABLE IF NOT EXISTS urls_valid_prefix (id INTEGER PRIMARY KEY, url_prefix TEXT UNIQUE, url_type TEXT)"
39
+ )
40
+
41
+ return pd.read_sql_query("SELECT * FROM urls LIMIT 100", conn)
42
+
43
+
44
+ # TODO: not sure this should be something. depends on the project
45
+ def seeds():
46
+ create_tables()
47
+
48
+ add_urls_valid_prefix("https://%.linkedin.com/posts/%", "linkedin_post")
49
+ add_urls_valid_prefix("https://lnkd.in/%", "linkedin_redirect")
50
+ add_urls_valid_prefix("https://%.linkedin.com/jobs/view/%", "linkedin_job")
51
+ add_urls_valid_prefix("https://%.linkedin.com/feed/%", "linkedin_feed")
52
+ add_urls_valid_prefix("https://%.linkedin.com/company/%", "linkedin_company")
53
+
54
+ # add_urls_valid_prefix("%.pdf", "pdf")
55
+ # add_url('https://imazon.org.br/categorias/artigos-cientificos/')
56
+
57
+ return True
58
+
59
+
60
+ def add_urls_valid_prefix(url_prefix, url_type):
61
+ conn = get_db_connection()
62
+
63
+ df = pd.read_sql_query(
64
+ f"SELECT * FROM urls_valid_prefix WHERE url_prefix = '{url_prefix}'", conn
65
+ )
66
+ if len(df) == 0:
67
+ c = conn.cursor()
68
+ c.execute(
69
+ "INSERT INTO urls_valid_prefix (url_prefix, url_type) VALUES (?, ?)",
70
+ (url_prefix, url_type),
71
+ )
72
+ conn.commit()
73
+
74
+
75
+ def get_urls_valid_prefix_by_type(url_type):
76
+ df = pd.read_sql_query(
77
+ f"SELECT * FROM urls_valid_prefix WHERE url_type = '{url_type}'", conn
78
+ )
79
+ return df
80
+
81
+
82
+ def get_urls_valid_prefix_by_id(id):
83
+ df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix WHERE id = '{id}'", conn)
84
+ return df
85
+
86
+
87
+ # TODO: pagination required
88
+ def get_urls_valid_prefix(limit=0):
89
+ if limit > 0:
90
+ df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix LIMIT {limit}", conn)
91
+ else:
92
+ df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix", conn)
93
+ return df
94
+
95
+
96
+ # TODO: pagination required
97
+ def get_urls(limit=0):
98
+ if limit > 0:
99
+ df = pd.read_sql_query(
100
+ f"SELECT * FROM urls LIMIT {limit} ORDER BY history ASC", conn
101
+ )
102
+ else:
103
+ df = pd.read_sql_query(f"SELECT * FROM urls ORDER BY history ASC", conn)
104
+ return df
105
+
106
+
107
+ def get_urls_report():
108
+ sql = """
109
+ WITH parent_url AS (
110
+ SELECT parent_url FROM urls WHERE parent_url IS NOT NULL AND parent_url != '' GROUP BY parent_url
111
+ ),
112
+ parents AS (
113
+ SELECT
114
+ u.id,
115
+ u.url,
116
+ u.h1
117
+ FROM urls u
118
+ INNER JOIN parent_url p
119
+ ON u.url = p.parent_url
120
+ )
121
+ SELECT
122
+ u.id,
123
+ u.url_type,
124
+ u.url,
125
+ COALESCE(u.h1, p.h1) as h1,
126
+ p.url as parent_url,
127
+ p.h1 as parent_h1
128
+ FROM urls u
129
+ LEFT JOIN parents p
130
+ ON u.parent_url = p.url
131
+ WHERE
132
+ u.history = 0
133
+ AND u.url NOT IN (SELECT url FROM parents)
134
+ ORDER BY url_type DESC
135
+ """
136
+ df = pd.read_sql_query(sql, conn)
137
+
138
+ return df
139
+
140
+
141
+ def get_url_by_url(url):
142
+ url = clean_url(url)
143
+ df = pd.read_sql_query(f"SELECT * FROM urls WHERE url = '{url}'", conn)
144
+
145
+ return df
146
+
147
+
148
+ def get_url_by_id(id):
149
+ df = pd.read_sql_query(f"SELECT * FROM urls WHERE id = '{id}'", conn)
150
+
151
+ return df
152
+
153
+
154
+ def get_urls_by_url_type(url_type):
155
+ df = pd.read_sql_query(
156
+ f"SELECT * FROM urls WHERE history = 0 AND url_type = '{url_type}'", conn
157
+ )
158
+ return df
159
+
160
+
161
+ def get_urls_by_url_type_for_ai_process(url_type="linkedin_post", limit=10):
162
+ df = pd.read_sql_query(
163
+ f"SELECT * FROM urls WHERE history = 0 AND url_type = '{url_type}' AND ai_processed = 0 LIMIT {limit}",
164
+ conn,
165
+ )
166
+ return df
167
+
168
+
169
+ def get_url_like_unclassified(like_condition):
170
+ df = pd.read_sql_query(
171
+ f"SELECT * FROM urls WHERE history = 0 AND url LIKE '{like_condition}' AND url_type IS NULL",
172
+ conn,
173
+ )
174
+ return df
175
+
176
+
177
+ def add_url(url, h1=None, parent_url=None):
178
+ url = clean_url(url)
179
+ c = conn.cursor()
180
+
181
+ if h1 is not None:
182
+ h1 = h1.strip()
183
+
184
+ if parent_url is not None:
185
+ parent_url = str(parent_url)
186
+
189
+ if len(get_url_by_url(url)) == 0:
190
+ c.execute(
191
+ "INSERT INTO urls (url, h1, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
192
+ (url, h1, parent_url, int(time.time())),
193
+ )
194
+ conn.commit()
195
+
196
+ return get_url_by_url(url)
197
+
198
+
199
+ def add_ai_log(instructions, response, model, prompt_file, prompt_name):
200
+ c = conn.cursor()
201
+
202
+ c.execute(
203
+ "INSERT INTO ai_log (instructions, response, model, prompt_file, prompt_name, created_at) VALUES (?, ?, ?, ?, ?, ?)",
204
+ (instructions, response, model, prompt_file, prompt_name, int(time.time())),
205
+ )
206
+ conn.commit()
207
+
208
+ def get_ai_log():
209
+ df = pd.read_sql_query(f"SELECT * FROM ai_log", conn)
210
+ return df
211
+
212
+ def set_url_destiny(url, destiny):
213
+ url = clean_url(url)
214
+ destiny = clean_url(destiny)
215
+ c = conn.cursor()
216
+ c.execute("UPDATE urls SET url_destiny = ? WHERE url = ?", (destiny, url))
217
+ c.execute(
218
+ "UPDATE urls SET parent_url = ? WHERE url = ?",
219
+ (str(url), destiny),
220
+ )
221
+
222
+ conn.commit()
223
+
224
+
225
+ def set_url_h1(url, value):
226
+ value = str(value).strip()
227
+ url = clean_url(url)
228
+ c = conn.cursor()
229
+ c.execute("UPDATE urls SET h1 = ? WHERE url = ?", (value, url))
230
+ conn.commit()
231
+
232
+
233
+ def set_url_h1_by_id(id, value):
234
+ value = str(value).strip()
235
+
236
+ c = conn.cursor()
237
+ c.execute("UPDATE urls SET h1 = ? WHERE id = ?", (value, id))
238
+ conn.commit()
239
+
240
+
241
+ def set_url_ai_processed_by_id(id, json_str):
242
+ value = 1
243
+ value = str(value).strip()
244
+ c = conn.cursor()
245
+ c.execute("UPDATE urls SET ai_processed = ? , json_ai = ? WHERE id = ?", (value, json_str, id))
246
+ conn.commit()
247
+
248
+ def set_url_empty_ai_processed_by_id(id, json_str="empty result"):
249
+ value = 1
250
+ value = str(value).strip()
251
+ c = conn.cursor()
252
+ c.execute("UPDATE urls SET ai_processed = ? , json_ai = ? WHERE ai_processed = 0 AND id = ?", (value, json_str, id))
253
+ conn.commit()
254
+
255
+ def set_url_ai_processed_by_url(url, json_str):
256
+ value = 1
257
+ value = str(value).strip()
258
+ url = clean_url(url)
259
+ c = conn.cursor()
260
+ c.execute("UPDATE urls SET ai_processed = ?, json_ai = ? WHERE url = ?", (value, json_str, url))
261
+ conn.commit()
262
+
263
+
264
+ def set_url_description(url, value):
265
+ url = clean_url(url)
266
+ c = conn.cursor()
267
+ c.execute("UPDATE urls SET description = ? WHERE url = ?", (value, url))
268
+ conn.commit()
269
+
270
+
271
+ def set_url_description_links(url, value):
272
+ url = clean_url(url)
273
+ c = conn.cursor()
274
+ c.execute("UPDATE urls SET description_links = ? WHERE url = ?", (value, url))
275
+ conn.commit()
276
+
277
+
278
+ def set_url_json(url, value):
279
+ url = clean_url(url)
280
+ c = conn.cursor()
281
+ c.execute("UPDATE urls SET json = ? WHERE url = ?", (value, url))
282
+ conn.commit()
283
+
284
+
285
+ def set_url_error(url, value):
286
+ url = clean_url(url)
287
+ c = conn.cursor()
288
+ c.execute("UPDATE urls SET error = ? WHERE url = ?", (value, url))
289
+ conn.commit()
290
+
291
+
292
+ def set_url_type_by_id(url_id, url_type):
293
+ c = conn.cursor()
294
+ c.execute(f"UPDATE urls SET url_type = '{url_type}' WHERE id = {url_id}")
295
+ conn.commit()
296
+
297
+
298
+ def clean_url(url):
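+ # Normalizes a url: forces https, drops the #fragment, removes utm_* query
+ # parameters and strips single quotes before the url is stored or looked up.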
299
+ if url[0:7] == "http://":
300
+ url = "https://" + url[7:]
301
+
302
+ if url[0:8] != "https://":
303
+ url = "https://" + url
304
+ url = url.split("#")[0]
305
+ old_query = urlparse(url).query.split("&")
306
+ new_query = []
307
+ for i in old_query:
308
+ if i[0:4] != "utm_":
309
+ new_query.append(i)
310
+
311
+ url = urlunparse(urlparse(url)._replace(query="&".join(new_query))).replace("'", "")
312
+ return url
313
+
314
+
315
+ def get_untouched_urls(
316
+ limit=10, randomize=True, ignore_valid_prefix=False, only_parents=True
317
+ ):
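+ # Returns up to `limit` non-history urls whose last_touch is still NULL; by
+ # default only classified urls (url_type set) without a parent, in random order.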
318
+ where_sql = ""
319
+ if not ignore_valid_prefix:
320
+ where_sql += " AND url_type IS NOT NULL "
321
+
322
+ if only_parents:
323
+ where_sql += " AND (parent_url = '' OR parent_url IS NULL) "
324
+
325
+ if randomize:
326
+ random_sql = " RANDOM() "
327
+ else:
328
+ random_sql = " created_at DESC "
329
+ sql = f"SELECT * FROM urls WHERE 1 = 1 AND history = 0 {where_sql} AND last_touch IS NULL ORDER BY {random_sql} LIMIT {limit}"
330
+ df = pd.read_sql_query(sql, conn)
331
+ return df
332
+
333
+
334
+ def touch_url(url):
335
+ url = clean_url(url)
336
+ c = conn.cursor()
337
+ c.execute("UPDATE urls SET last_touch = ? WHERE url = ?", (int(time.time()), url))
338
+ conn.commit()
339
+
340
+
341
+ def untouch_url(url):
342
+ url = clean_url(url)
343
+ c = conn.cursor()
344
+ c.execute("UPDATE urls SET last_touch = NULL WHERE url = ?", (url))
345
+ conn.commit()
346
+
347
+
348
+ def untouch_all_urls():
349
+ c = conn.cursor()
350
+ c.execute("UPDATE urls SET last_touch = NULL WHERE history = 0")
351
+ conn.commit()
352
+
353
+
354
+ def set_all_urls_as_history():
355
+ c = conn.cursor()
356
+ c.execute("UPDATE urls SET history = 1")
357
+ conn.commit()
358
+
359
+
360
+ def merge_dbs() -> None:
361
+ production_db_file = get_db_path()
362
+ db_number = -1
363
+ dir = get_db_dir()
364
+ list_of_files = glob.glob(dir + "/*.db")
365
+ list_of_files.remove(production_db_file)
366
+ if len(list_of_files) > 0:
367
+ print("\nAvailable dbs:")
368
+ for index, file in enumerate(list_of_files):
369
+ print(index, ":", file)
370
+ while db_number < 0 or db_number >= len(list_of_files):
371
+ db_number = int(input("Choose the db to merge: "))
372
+
373
+ print(list_of_files[db_number])
374
+ source_conn = sqlite3.connect(list_of_files[db_number])
375
+ df = pd.read_sql_query("SELECT * FROM urls", source_conn)
376
+ for index, row in df.iterrows():
377
+ merge_url(
378
+ row["url"],
379
+ f"merged from {list_of_files[db_number]}",
380
+ row["last_touch"],
381
+ row["created_at"],
382
+ row["description"],
383
+ row["json"],
384
+ )
385
+ # merge_url(df)
386
+
387
+
388
+ def merge_url(url, h1, last_touch, created_at, description, json):
389
+ url = clean_url(url)
390
+ c = conn.cursor()
391
+
392
+ if h1 is not None:
393
+ h1 = h1.strip()
394
+
395
+ if len(get_url_by_url(url)) == 0:
396
+ c.execute(
397
+ "INSERT INTO urls (url, h1, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
398
+ (url, h1, last_touch, created_at, description, json),
399
+ )
400
+ conn.commit()
+++ ohmyscrapper/modules/classify_urls.py
@@ -0,0 +1,23 @@
1
+ import ohmyscrapper.models.urls_manager as urls_manager
2
+ import pandas as pd
3
+ import time
4
+
5
+
6
+ def classify_urls(recursive=False):
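+ # Assigns url_type to unclassified urls by matching them against the LIKE
+ # patterns in urls_valid_prefix; when recursive, it re-runs every 10 seconds.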
7
+ urls_manager.seeds()
8
+ df = urls_manager.get_urls_valid_prefix()
9
+
10
+ keep_alive = True
11
+ while keep_alive:
12
+ print("waking up!")
13
+ for index, row_prefix in df.iterrows():
14
+ df_urls = urls_manager.get_url_like_unclassified(like_condition=row_prefix["url_prefix"])
15
+ for index, row_urls in df_urls.iterrows():
16
+ urls_manager.set_url_type_by_id(url_id=row_urls["id"], url_type=row_prefix["url_type"])
17
+
18
+ if not recursive:
19
+ print("ending...")
20
+ keep_alive = False
21
+ else:
22
+ print("sleeping...")
23
+ time.sleep(10)
+++ ohmyscrapper/modules/load_txt.py
@@ -0,0 +1,32 @@
1
+ import os
2
+ from urlextract import URLExtract
3
+ import ohmyscrapper.models.urls_manager as urls_manager
4
+
5
+
6
+ def load_txt(file_name="input/_chat.txt"):
7
+
8
+ if not os.path.exists("input"):
9
+ os.mkdir("input")
10
+
11
+ urls_manager.create_tables()
12
+ urls_manager.seeds()
13
+ # make it recursive for all files
14
+ text_file_content = open(file_name, "r").read()
15
+
16
+ put_urls_from_string(text_to_process=text_file_content)
17
+
18
+ # move_it_to_processed
19
+ print("--------------------")
20
+ print(file_name, "processed")
21
+
22
+
23
+ def put_urls_from_string(text_to_process, parent_url=None):
24
+ if isinstance(text_to_process, str):
25
+ extractor = URLExtract()
26
+ urls = extractor.find_urls(text_to_process)
27
+ for url in urls:
28
+ urls_manager.add_url(url=url, parent_url=parent_url)
29
+ print(url, "added")
30
+
+ return len(urls)
31
+ else:
32
+ return 0
+++ ohmyscrapper/modules/merge_dbs.py
@@ -0,0 +1,6 @@
1
+ import ohmyscrapper.models.urls_manager as urls_manager
2
+
3
+
4
+ def merge_dbs():
5
+ urls_manager.merge_dbs()
6
+ return
+++ ohmyscrapper/modules/process_with_ai.py
@@ -0,0 +1,197 @@
1
+ import ohmyscrapper.models.urls_manager as urls_manager
2
+ from bs4 import BeautifulSoup
3
+ from google import genai
4
+ from dotenv import load_dotenv
5
+ import random
6
+ import time
7
+ import os
8
+ import yaml
9
+ import json
10
+ # TODO: !!! REFACTOR !!!
11
+ load_dotenv()
12
+
13
+ def reprocess_ai_history():
14
+ df = urls_manager.get_ai_log().to_dict(orient="records")
15
+ for row in df:
16
+ process_ai_response(row["response"])
17
+
18
+
19
+ def process_ai_response(response):
20
+ job_positions = xml2dict(response)
21
+
22
+ for index, xml_item_children in job_positions.items():
23
+ for url_child_xml in xml_item_children:
24
+
25
+ url_parent = urls_manager.get_url_by_id(url_child_xml["id"])
26
+ if len(url_parent) > 0:
27
+ url_parent = url_parent.iloc[0]
28
+ h1 = url_child_xml.copy()
29
+ del h1["id"]
30
+ del h1["url"]
31
+ h1 = " - ".join(h1.values())
32
+ if url_parent["description_links"] > 1 and url_child_xml["id"] != "":
33
+ print("-- child updated -- \n", url_child_xml["url"] , ":", h1)
34
+ urls_manager.set_url_h1(url_child_xml["url"], h1)
35
+ urls_manager.set_url_ai_processed_by_url(url_child_xml["url"], str(json.dumps(url_child_xml)))
36
+ if url_parent["url"] != url_child_xml["url"]:
37
+ urls_manager.set_url_ai_processed_by_url(url_parent["url"], "children-update")
38
+ else:
39
+ print("-- parent updated -- \n", url_parent["url"], ":", h1)
40
+ urls_manager.set_url_h1(url_parent["url"], h1)
41
+ urls_manager.set_url_ai_processed_by_url(url_parent["url"], str(json.dumps(url_child_xml)))
42
+
43
+
44
+ def xml2dict(xml_string):
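+ # Parses the model's XML-like response into {top-level tag: [child dicts]},
+ # where each child dict maps a tag name to its text.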
45
+ soup = BeautifulSoup(xml_string, "html.parser")
46
+
47
+ children_items_dict = {}
48
+ for item in soup.find_all():
49
+ if(item.parent.name == "[document]"):
50
+ children_items_dict[item.name] = []
51
+ elif item.parent.name in children_items_dict:
52
+ children_items_dict[item.parent.name].append(_xml_children_to_dict(item))
53
+
54
+ return children_items_dict
55
+
56
+ def _xml_children_to_dict(xml):
57
+ item_dict = {}
58
+ for item in xml.find_all():
59
+ item_dict[item.name] = item.text
60
+ return item_dict
61
+
62
+ def process_with_ai(recursive=True, triggered_times=0):
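+ # Sends batches of unprocessed linkedin_post descriptions to Gemini and stores
+ # the parsed response; when recursive, it calls itself but stops once it has
+ # been triggered more than 5 times to prevent budget accidents.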
63
+ triggered_times = triggered_times + 1
64
+
65
+ prompt = _get_prompt()
66
+ if not prompt:
67
+ return
68
+
69
+ url_type = "linkedin_post"
70
+ df = urls_manager.get_urls_by_url_type_for_ai_process(url_type)
71
+ if len(df) == 0:
72
+ print("no urls to process with ai anymore")
73
+ return
74
+
75
+ texts = ""
76
+ for index, row in df.iterrows():
77
+ texts = (
78
+ texts
79
+ + f"""
80
+ <text>
81
+ <id>{str(row['id'])}</id>
82
+ {row['description']}
83
+ </text>
84
+ """
85
+ )
86
+ if texts == "":
87
+ print("no urls to process")
88
+ return
89
+
90
+ print("starting...")
91
+ print("prompt:", prompt["name"])
92
+ print("model:", prompt["model"])
93
+ print("description:", prompt["description"])
94
+ prompt["instructions"] = prompt["instructions"].replace("{ohmyscrapper_texts}", texts)
95
+
96
+ # The client gets the API key from the environment variable `GEMINI_API_KEY`.
97
+ client = genai.Client()
98
+ response = client.models.generate_content(model=prompt["model"], contents=prompt["instructions"])
99
+ response = str(response.text)
100
+ urls_manager.add_ai_log(instructions=prompt["instructions"], response=response, model=prompt["model"], prompt_name=prompt["name"], prompt_file=prompt["prompt_file"])
101
+ print(response)
102
+ print("^^^^^^")
103
+ process_ai_response(response=response)
104
+ print("ending...")
105
+
106
+ for index, row in df.iterrows():
107
+ urls_manager.set_url_empty_ai_processed_by_id(row["id"])
108
+
109
+ if recursive:
110
+ wait = random.randint(1, 3)
111
+ print("sleeping for", wait, "seconds before next round")
112
+ time.sleep(wait)
113
+
114
+ if triggered_times > 5:
115
+ print("!!! This is a break to prevent budget accident$.")
116
+ print("You triggered", triggered_times, "times the AI processing function.")
117
+ print("If you are sure this is correct, you can re-call this function again.")
118
+ print("Please, check it.")
119
+ return
120
+
121
+ process_with_ai(recursive=recursive, triggered_times=triggered_times)
122
+
123
+ return
124
+
125
+ def _get_prompt():
126
+ prompts_path = "prompts"
127
+ default_prompt = """---
128
+ model: "gemini-2.5-flash"
129
+ name: "default-prompt"
130
+ description: "Put here your prompt description."
131
+ ---
132
+ Process with AI this prompt: {ohmyscrapper_texts}
133
+ """
134
+ if not os.path.exists(prompts_path):
135
+ os.mkdir(prompts_path)
136
+
137
+ open(f"{prompts_path}/prompt.md", "w").write(default_prompt)
138
+ print(f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there.")
139
+ return False
140
+
141
+ prompt_files = os.listdir(prompts_path)
142
+ if len(prompt_files) == 0:
143
+ open(f"{prompts_path}/prompt.md", "w").write(default_prompt)
144
+ print(f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there.")
145
+ return False
146
+ prompt = {}
147
+ if len(prompt_files) == 1:
148
+ prompt = _parse_prompt(prompts_path=prompts_path, prompt_file=prompt_files[0])
149
+ else:
150
+ print("Choose a prompt:")
151
+ prompts = {}
152
+ for index, file in enumerate(prompt_files):
153
+ prompts[index] = _parse_prompt(prompts_path=prompts_path, prompt_file=file)
154
+ print(index, ":", prompts[index]['name'])
155
+ input_prompt = input("Type the number of the prompt you want to use or 'q' to quit: ")
156
+ if input_prompt == "q":
157
+ return False
158
+ try:
159
+ prompt = prompts[int(input_prompt)]
160
+ except (KeyError, ValueError):
161
+ print("! Invalid prompt\n")
162
+ prompt = _get_prompt()
163
+ return prompt
164
+
165
+ def _parse_prompt(prompts_path, prompt_file):
166
+ prompt = {}
167
+ raw_prompt = open(f"{prompts_path}/{prompt_file}", "r").read().split("---")
168
+ prompt = yaml.safe_load(raw_prompt[1])
169
+ prompt["instructions"] = raw_prompt[2].strip()
170
+ prompt["prompt_file"] = prompt_file
171
+
172
+ return prompt
173
+ # TODO: Separate gemini from basic function
174
+ def _process_with_gemini(model, instructions):
175
+ response = """"""
176
+ return response
177
+
178
+
179
+ def _process_with_openai(model, instructions):
180
+ # import os
181
+ # from openai import OpenAI
182
+
183
+ # client = OpenAI(
184
+ # # This is the default and can be omitted
185
+ # api_key=os.environ.get("OPENAI_API_KEY"),
186
+ # )
187
+
188
+ # response = client.responses.create(
189
+ # model="gpt-4o",
190
+ # instructions="You are a coding assistant that talks like a pirate.",
191
+ # input="How do I check if a Python object is an instance of a class?",
192
+ # )
193
+
194
+ # print(response.output_text)
195
+
196
+ response = """"""
197
+ return response
+++ ohmyscrapper/modules/scrap_urls.py
@@ -0,0 +1,178 @@
1
+ import ohmyscrapper.models.urls_manager as urls_manager
2
+ import ohmyscrapper.modules.sniff_url as sniff_url
3
+ import ohmyscrapper.modules.load_txt as load_txt
4
+ import ohmyscrapper.modules.classify_urls as classify_urls
5
+
6
+ import time
7
+ import random
8
+
9
+
10
+ def process_linkedin_redirect(url_report, url):
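+ # Resolves a lnkd.in redirect: on sparse pages (fewer than 5 links) it follows
+ # the first <a> link, otherwise the og:url meta tag, and stores the result as
+ # this url's destiny.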
11
+ print("linkedin_redirect")
12
+
13
+ if url_report["total-a-links"] < 5:
14
+ if "first-a-link" in url_report.keys():
15
+ url_destiny = url_report["first-a-link"]
16
+ else:
17
+ urls_manager.set_url_error(url=url["url"], value="error: no first-a-link")
18
+ print("no url for:", url["url"])
19
+ return
20
+ else:
21
+ if "og:url" in url_report.keys():
22
+ url_destiny = url_report["og:url"]
23
+ else:
24
+ urls_manager.set_url_error(url=url["url"], value="error: no og:url")
25
+ print("no url for:", url["url"])
26
+ return
27
+
28
+ print(url["url"], ">>", url_destiny)
29
+ urls_manager.add_url(url=url_destiny)
30
+ urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)
31
+
32
+
33
+ def process_linkedin_feed(url_report, url):
34
+ print("linkedin_feed")
35
+
36
+ if "og:url" in url_report.keys():
37
+ url_destiny = url_report["og:url"]
38
+ else:
39
+ urls_manager.set_url_error(url=url["url"], value="error: no og:url")
40
+ print("no url for:", url["url"])
41
+ return
42
+
43
+ print(url["url"], ">>", url_destiny)
44
+ urls_manager.add_url(url=url_destiny)
45
+ urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)
46
+
47
+
48
+ def process_linkedin_job(url_report, url):
49
+ print("linkedin_job")
50
+ changed = False
51
+ if "h1" in url_report.keys():
52
+ print(url["url"], ": ", url_report["h1"])
53
+ urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
54
+ changed = True
55
+ elif "og:title" in url_report.keys():
56
+ print(url["url"], ": ", url_report["og:title"])
57
+ urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
58
+ changed = True
59
+
60
+ if "description" in url_report.keys():
61
+ urls_manager.set_url_description(url=url["url"], value=url_report["description"])
62
+ changed = True
63
+ elif "og:description" in url_report.keys():
64
+ urls_manager.set_url_description(url=url["url"], value=url_report["og:description"])
65
+ changed = True
66
+ if not changed:
67
+ urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")
68
+
69
+
70
+ def process_linkedin_post(url_report, url):
71
+ print("linkedin_post or generic")
72
+ print(url["url"])
73
+ changed = False
74
+ if "h1" in url_report.keys():
75
+ print(url["url"], ": ", url_report["h1"])
76
+ urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
77
+ changed = True
78
+ elif "og:title" in url_report.keys():
79
+ urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
80
+ changed = True
81
+ description = None
82
+ if "description" in url_report.keys():
83
+ description = url_report["description"]
84
+ changed = True
85
+ elif "og:description" in url_report.keys():
86
+ description = url_report["og:description"]
87
+ changed = True
88
+
89
+ if description is not None:
90
+ urls_manager.set_url_description(url=url["url"], value=description)
91
+ description_links = load_txt.put_urls_from_string(text_to_process=description, parent_url=url["url"])
92
+ urls_manager.set_url_description_links(url=url["url"], value=description_links)
93
+
94
+ if not changed:
95
+ urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")
96
+
97
+
98
+ def scrap_url(url):
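+ # Sniffs a single url and dispatches it to the matching handler above, then
+ # stores the raw report json and marks the url as touched.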
99
+ # TODO: Use get_urls_valid_prefix_by_id()
100
+ df = urls_manager.get_urls_valid_prefix()
101
+
102
+ # TODO: Need to change this
103
+
104
+ if url["url_type"] is None:
105
+ print("\n\ngeneric:", url["url"])
106
+ url["url_type"] = "generic"
107
+ else:
108
+ print("\n\n", url["url_type"] + ":", url["url"])
109
+ try:
110
+ url_report = sniff_url.get_tags(url=url["url"])
111
+ except Exception as e:
112
+ urls_manager.set_url_error(url=url["url"], value="error")
113
+ urls_manager.touch_url(url=url["url"])
114
+ print("\n\n!!! ERROR FOR:", url["url"])
115
+ print(
116
+ "\n\n!!! you can check the URL using the command sniff-url",
117
+ url["url"],
118
+ "\n\n",
119
+ )
120
+ return
121
+
122
+ # linkedin_redirect - linkedin (https://lnkd.in/)
123
+ if url["url_type"] == "linkedin_redirect":
124
+ process_linkedin_redirect(url_report=url_report, url=url)
125
+
126
+ # linkedin_feed - linkedin (https://%.linkedin.com/feed/)
127
+ if url["url_type"] == "linkedin_feed":
128
+ process_linkedin_feed(url_report=url_report, url=url)
129
+
130
+ # linkedin_job - linkedin (https://www.linkedin.com/jobs/)
131
+ if url["url_type"] == "linkedin_job":
132
+ process_linkedin_job(url_report=url_report, url=url)
133
+
134
+ # linkedin_job - linkedin (https://www.linkedin.com/jobs/)
135
+ if url["url_type"] == "linkedin_post" or url["url_type"] == "generic":
136
+ process_linkedin_post(url_report=url_report, url=url)
137
+
138
+ urls_manager.set_url_json(url=url["url"], value=url_report["json"])
139
+ urls_manager.touch_url(url=url["url"])
140
+
141
+
142
+ def isNaN(num):
143
+ return num != num
144
+
145
+
146
+ def scrap_urls(
147
+ recursive=False, ignore_valid_prefix=False, randomize=False, only_parents=True
148
+ ):
149
+ classify_urls.classify_urls()
150
+ urls = urls_manager.get_untouched_urls(
151
+ ignore_valid_prefix=ignore_valid_prefix,
152
+ randomize=randomize,
153
+ only_parents=only_parents,
154
+ )
155
+ if len(urls) == 0:
156
+ print("no urls to scrap")
157
+ return
158
+ for index, url in urls.iterrows():
159
+ scrap_url(url)
160
+
161
+ wait = random.randint(15, 20)
162
+ wait = random.randint(1, 3)
163
+ print("sleeping for", wait, "seconds")
164
+ time.sleep(wait)
165
+
166
+ classify_urls.classify_urls()
167
+ if recursive:
168
+ wait = random.randint(5, 10)
169
+ print("sleeping for", wait, "seconds before next round")
170
+ time.sleep(wait)
171
+ scrap_urls(
172
+ recursive=recursive,
173
+ ignore_valid_prefix=ignore_valid_prefix,
174
+ randomize=randomize,
175
+ only_parents=only_parents,
176
+ )
177
+ else:
178
+ print("ending...")
+++ ohmyscrapper/modules/seed.py
@@ -0,0 +1,7 @@
1
+ import ohmyscrapper.models.urls_manager as urls_manager
2
+
3
+
4
+ def seed():
5
+ urls_manager.seeds()
6
+ print("db seeded")
7
+ return
+++ ohmyscrapper/modules/show.py
@@ -0,0 +1,127 @@
1
+ import ohmyscrapper.models.urls_manager as urls_manager
2
+ import math
3
+ from rich.console import Console
4
+ from rich.table import Table
5
+
6
+
7
+ def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):
8
+ df = urls_manager.get_urls(limit=limit)
9
+
10
+ if simplify:
11
+ df.drop(columns=["description", "json"], inplace=True)
12
+
13
+ df.to_csv(csv_file, index=False)
14
+ print("--------------------")
15
+ print("Urls exported to", csv_file)
16
+
17
+ df.replace(
18
+ {
19
+ "description": {r"\n": " "},
20
+ },
21
+ regex=True,
22
+ inplace=True,
23
+ )
24
+ df.to_html(csv_file + "-preview.html", index=False)
25
+ print("Urls preview exported to", csv_file + "-preview.html")
26
+ print("--------------------")
27
+
28
+
29
+ def export_report(csv_file="output/report.csv"):
30
+ df = urls_manager.get_urls_report()
31
+
32
+ df.to_csv(csv_file, index=False)
33
+ _clear_file(csv_file)
34
+ print("--------------------")
35
+ print("Urls report exported to", csv_file)
36
+
37
+ df.replace(
38
+ {
39
+ "description": {r"\n": " "},
40
+ },
41
+ regex=True,
42
+ inplace=True,
43
+ )
44
+ df.to_html(csv_file + "-preview.html", index=False)
45
+ _clear_file(csv_file + "-preview.html")
46
+
47
+ print("Urls report preview exported to", csv_file + "-preview.html")
48
+ print("--------------------")
49
+
50
+ # TODO: Add transformation layer
51
+ def _clear_file(txt_file):
52
+ with open(txt_file, "r") as f:
53
+ content = f.read()
54
+ content = content.replace(" - -", " -")
55
+ content = content.replace(" -<", "<")
56
+ with open(txt_file, "w") as f:
57
+ f.write(content)
58
+
59
+ def show_urls(limit=0, jump_to_page=0):
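+ # Prints the urls table in pages of 15 rows; enter shows the next page, a page
+ # number jumps to that page and q quits.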
60
+ df = urls_manager.get_urls(limit=limit)
61
+ df.drop(columns=["json", "description"], inplace=True)
62
+ # df = df.head(n=20)
63
+
64
+ # https://medium.com/@inzaniak/create-tables-in-your-terminal-with-python-6747d68d71a6
65
+
66
+ total_items = len(df)
67
+ items_per_page = 15
68
+ n_pages = math.ceil(total_items / items_per_page)
69
+
70
+ last_popped = 0
71
+ for page in range(n_pages):
72
+
73
+ df_page = df.head(n=items_per_page)
74
+ df_t = df.T
75
+ for i in range(items_per_page):
76
+ if last_popped < total_items:
77
+ df_t.pop(last_popped)
78
+ last_popped += 1
79
+ df = df_t.T
80
+ if page < jump_to_page:
81
+ continue
82
+ show_table(df_page)
83
+
84
+ print("Page", page + 1, "of", n_pages)
85
+ user_input = input("Press enter to continue or type q to quit: ")
86
+ if user_input == "q":
87
+ break
88
+ if user_input.isnumeric():
89
+ jump_to_page = math.ceil(int(user_input))
90
+ if jump_to_page > n_pages or jump_to_page < 1:
91
+ print("This page does not exist")
92
+ jump_to_page = 0
93
+ else:
94
+ jump_to_page = jump_to_page - 1
95
+ if page < jump_to_page:
96
+ continue
97
+ elif jump_to_page >= 0:
98
+ show_urls(limit=limit, jump_to_page=jump_to_page)
99
+ break
100
+
101
+ return
102
+
103
+ return
104
+
105
+
106
+ # TODO: Change place
107
+ def show_table(df):
108
+ columns = df.columns.tolist()
109
+ df = df.to_dict(orient="records")
110
+ table = Table(show_header=True, header_style="bold magenta")
111
+ for column in columns:
112
+ table.add_column(column)
113
+
114
+ for row in df:
115
+ table.add_row(*[str(value) for value in row.values()])
116
+ console = Console()
117
+ console.print(table)
118
+
119
+
120
+ def show_urls_valid_prefix(limit=0):
121
+ print(urls_manager.get_urls_valid_prefix(limit=limit))
122
+ return
123
+
124
+
125
+ def show_url(url):
126
+ print(urls_manager.get_url_by_url(url=url).T)
127
+ return
+++ ohmyscrapper/modules/sniff_url.py
@@ -0,0 +1,88 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import json
4
+
5
+
6
+ def sniff_url(url="https://www.linkedin.com/in/cesardesouzacardoso/", silent=False):
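+ # Fetches the url and builds a report dict with h1/h2, selected <meta> tags,
+ # the first <a> href, link and meta-tag counts, plus a json dump of the report.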
7
+ if not silent:
8
+ print("checking url:", url)
9
+ report_meta_tags = []
10
+ tags_to_search = [
11
+ "description",
12
+ "og:url",
13
+ "og:title",
14
+ "og:description",
15
+ "og:type",
16
+ "lnkd:url",
17
+ ]
18
+
19
+ r = requests.get(url=url)
20
+ soup = BeautifulSoup(r.text, "html.parser")
21
+
22
+ if not silent:
23
+ print("\n\n\n\n---- all <meta> tags ---\n")
24
+ i = 0
25
+ for meta_tag in soup.find_all("meta"):
26
+ if (
27
+ meta_tag.get("name") in tags_to_search
28
+ or meta_tag.get("property") in tags_to_search
29
+ ):
30
+ report_meta_tags.append(meta_tag)
31
+ i = i + 1
32
+ if not silent:
33
+ print("-- meta tag", i, "--")
34
+ print("name:", meta_tag.get("name"))
35
+ print("property:", meta_tag.get("property"))
36
+ print("content:", meta_tag.get("content"))
37
+ print("---------------- \n")
38
+
39
+ if not silent:
40
+ print("\n\n\n\n---- all <a> links ---")
41
+ i = 0
42
+ for a_tag in soup.find_all("a"):
43
+ i = i + 1
44
+ print("\n-- a link", i, "-- ")
45
+ print("target:", a_tag.get("target"))
46
+ print("text:", a_tag.text)
47
+ print("href:", a_tag.get("href"))
48
+ print("-------------- ")
49
+
50
+ final_report = {}
51
+ final_report["scrapped-url"] = url
52
+ if len(soup.find_all("h1")) > 0:
53
+ final_report["h1"] = soup.find("h1").text
54
+
55
+ for report_meta_tag in report_meta_tags:
56
+ if report_meta_tag.get("name") is not None:
57
+ final_report[report_meta_tag.get("name")] = report_meta_tag.get("content")
58
+ elif report_meta_tag.get("property") is not None:
59
+ final_report[report_meta_tag.get("property")] = report_meta_tag.get(
60
+ "content"
61
+ )
62
+
63
+ if len(soup.find_all("a")) > 0:
64
+ final_report["first-a-link"] = soup.find("a").get("href")
65
+ final_report["total-a-links"] = len(soup.find_all("a"))
66
+ else:
67
+ final_report["first-a-link"] = ""
68
+ final_report["total-a-links"] = 0
69
+
70
+ if len(soup.find_all("h2")) > 0:
71
+ final_report["h2"] = soup.find("h2").text
72
+
73
+ if len(soup.find_all("meta")) > 0:
74
+ final_report["total-meta-tags"] = len(soup.find_all("meta"))
75
+ else:
76
+ final_report["total-meta-tags"] = 0
77
+
78
+ final_report["json"] = json.dumps(final_report)
79
+ if not silent:
80
+ print("\n\n\n----report---\n")
81
+ for key in final_report:
82
+ print("* ", key, ":", final_report[key])
83
+
84
+ return final_report
85
+
86
+
87
+ def get_tags(url):
88
+ return sniff_url(url=url, silent=True)
+++ ohmyscrapper/modules/untouch_all.py
@@ -0,0 +1,7 @@
1
+ import ohmyscrapper.models.urls_manager as urls_manager
2
+
3
+
4
+ def untouch_all():
5
+ urls_manager.untouch_all_urls()
6
+ print("urls have been untouched")
7
+ return
+++ ohmyscrapper-0.2.1.dist-info/METADATA
@@ -0,0 +1,110 @@
1
+ Metadata-Version: 2.3
2
+ Name: ohmyscrapper
3
+ Version: 0.2.1
4
+ Summary: This project aims to create a text-based scraper containing links to create a final PDF with general information about job openings.
5
+ Author: Cesar Cardoso gh@bouli
6
+ Author-email: Cesar Cardoso gh@bouli <hello@cesarcardoso.cc>
7
+ Requires-Dist: beautifulsoup4>=4.14.3
8
+ Requires-Dist: google-genai>=1.55.0
9
+ Requires-Dist: markdown>=3.10
10
+ Requires-Dist: pandas>=2.3.3
11
+ Requires-Dist: python-dotenv>=1.2.1
12
+ Requires-Dist: pyyaml>=6.0.3
13
+ Requires-Dist: requests>=2.32.5
14
+ Requires-Dist: rich>=14.2.0
15
+ Requires-Dist: urlextract>=1.9.0
16
+ Requires-Python: >=3.11
17
+ Description-Content-Type: text/markdown
18
+
19
+ # OhMyScrapper - v0.2.1
20
+
21
+ This project aims to create a text-based scraper: it reads texts containing
22
+ links and builds a final PDF with general information about job openings.
23
+
24
+ > This project is using [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer) by default.
25
+
26
+ ## Scope
27
+
28
+ - Read texts;
29
+ - Extract links;
30
+ - Use meta og:tags to extract information;
31
+
32
+ ## Installation
33
+
34
+ I recommend using [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer), so you can just run the command below and everything gets installed:
35
+ ```shell
36
+ uv sync
37
+ ```
38
+
39
+ ## How to use and test (development only)
40
+
41
+ OhMyScrapper works in 3 stages:
42
+
43
+ 1. It collects and loads urls from a text file (by default `input/_chat.txt`) into a database;
44
+ 2. It scraps/accesses the collected urls and reads what is relevant. If it finds new urls, they are collected as well;
45
+ 3. It exports a list of urls to CSV files;
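+
+ Under the hood these stages are plain module functions, so you can also drive
+ them from Python. Here is a minimal sketch (assuming the package is installed,
+ you run it from the project root, and the `input/` and `output/` folders exist
+ with your `_chat.txt` inside):
+ ```python
+ from ohmyscrapper.modules.load_txt import load_txt
+ from ohmyscrapper.modules.scrap_urls import scrap_urls
+ from ohmyscrapper.modules.show import export_urls, export_report
+
+ load_txt("input/_chat.txt")                  # stage 1: collect urls into db/local.db
+ scrap_urls(recursive=True)                   # stage 2: scrap them (new urls are collected too)
+ export_urls(csv_file="output/urls.csv")      # stage 3: export the urls table
+ export_report(csv_file="output/report.csv")  # and the report
+ ```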
46
+
47
+ You can run all 3 stages with the command:
48
+ ```shell
49
+ make start
50
+ ```
51
+ > Remember to add your text file in the folder `/input` with the name `_chat.txt`!
52
+
53
+ You will find the exported files in the folder `/output` like this:
54
+ - `/output/report.csv`
55
+ - `/output/report.csv-preview.html`
56
+ - `/output/urls-simplified.csv`
57
+ - `/output/urls-simplified.csv-preview.html`
58
+ - `/output/urls.csv`
59
+ - `/output/urls.csv-preview.html`
60
+
61
+ ### But if you want to do it step by step, here it is:
62
+
63
+ First we load a text file in which to look for urls; the idea here is to
64
+ use the WhatsApp chat history, but it works with any txt file.
65
+
66
+ The default file is `input/_chat.txt`. If you have the default file you just use
67
+ the command `load`:
68
+ ```shell
69
+ make load
70
+ ```
71
+ or, if you have another file, just use the argument `-file` like this:
72
+ ```shell
73
+ uv run main.py load -file=my-text-file.txt
74
+ ```
75
+ That will create a database if it doesn't exist and store every url that oh-my-scrapper
76
+ finds. After that, let's scrap the urls with the command `scrap-urls`:
77
+
78
+ ```shell
79
+ make scrap-urls
80
+ ```
81
+
82
+ That will scrap only the linkedin urls we are interested in. For now they are:
83
+ - linkedin_post: https://%.linkedin.com/posts/%
84
+ - linkedin_redirect: https://lnkd.in/%
85
+ - linkedin_job: https://%.linkedin.com/jobs/view/%
86
+ - linkedin_feed" https://%.linkedin.com/feed/%
87
+ - linkedin_company: https://%.linkedin.com/company/%
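+
+ These prefixes are the SQL `LIKE` patterns seeded by the `seed` command. If you
+ want another pattern to be classified and scraped as well, a small sketch like
+ this should work (the `example.com` pattern below is just a placeholder):
+ ```python
+ import ohmyscrapper.models.urls_manager as urls_manager
+
+ urls_manager.seeds()  # make sure the tables and default prefixes exist
+ # urls of type "generic" are handled by the same path as linkedin posts
+ urls_manager.add_urls_valid_prefix("https://%.example.com/jobs/%", "generic")
+ ```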
88
+
89
+ But we can scrap every other url generically by using the argument `--ignore-type`:
90
+ ```shell
91
+ uv run main.py scrap-urls --ignore-type
92
+ ```
93
+
94
+ And we can ask it to run recursively by adding the argument `--recursive`:
95
+ ```shell
96
+ uv run main.py scrap-urls --recursive
97
+ ```
98
+ > !!! important: we are not sure whether too many requests will get us blocked
99
+
100
+ And we can finally export with the command:
101
+ ```shell
102
+ make export
103
+ ```
104
+
105
+
106
+ That's the basic usage!
107
+ But you can find out more using the help:
108
+ ```shell
109
+ uv run main.py --help
110
+ ```
+++ ohmyscrapper-0.2.1.dist-info/RECORD
@@ -0,0 +1,16 @@
1
+ ohmyscrapper/__init__.py,sha256=OOoRFtkBKaTIf74FStI0MGtk-LUQOuN0QnBZRfRWauA,5145
2
+ ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
3
+ ohmyscrapper/models/urls_manager.py,sha256=xKql_xdwfRwgpMyriuIrZ0Srz4gYQGMfWClEWpGRtNE,11183
4
+ ohmyscrapper/modules/classify_urls.py,sha256=eyHtTHDZp2pGmYw_X-7LrbeVOgDPcRQdhu0oEuwQtKA,743
5
+ ohmyscrapper/modules/load_txt.py,sha256=mL60OGsh-R80P88vxyqvfBEFag9yhSFFbg5pwtu1f90,889
6
+ ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
7
+ ohmyscrapper/modules/process_with_ai.py,sha256=TpumucIVNZulKOw2idy4hD3vG5IhG5pbhyJImYFP8g0,6844
8
+ ohmyscrapper/modules/scrap_urls.py,sha256=KQVs3R03X80hmvvJAU1SqnNhwXEeVV99WlN8TxSKqA8,6097
9
+ ohmyscrapper/modules/seed.py,sha256=KeTSbmTdNTkVCtzk9iQmeuEqB0kG-rTZJb2a1WdROL4,129
10
+ ohmyscrapper/modules/show.py,sha256=u0L9uxgU8Xt_-myA3r7byuOmnX_-2gkpTtXWkXon1ns,3572
11
+ ohmyscrapper/modules/sniff_url.py,sha256=jQDc7aSimuOOedw2fSXZlf6_o0OqQHOr6NsWb4n0XgI,2720
12
+ ohmyscrapper/modules/untouch_all.py,sha256=E1U9e3sOG7suzc8ZTWcYiQQo9mPmLJ0piXdXUjFLEd4,162
13
+ ohmyscrapper-0.2.1.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
14
+ ohmyscrapper-0.2.1.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
15
+ ohmyscrapper-0.2.1.dist-info/METADATA,sha256=Sl1HuVlxTSSAYz9ga0zJ9xUpWGY2NZOkNu1xTNtGUu8,3411
16
+ ohmyscrapper-0.2.1.dist-info/RECORD,,
+++ ohmyscrapper-0.2.1.dist-info/WHEEL
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.9.17
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
+++ ohmyscrapper-0.2.1.dist-info/entry_points.txt
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ ohmyscrapper = ohmyscrapper:main
3
+