ohmyscrapper 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,155 @@
1
+ import argparse
2
+
3
+ from ohmyscrapper.modules.classify_urls import classify_urls
4
+ from ohmyscrapper.modules.sniff_url import sniff_url
5
+ from ohmyscrapper.modules.load_txt import load_txt
6
+ from ohmyscrapper.modules.seed import seed
7
+ from ohmyscrapper.modules.scrap_urls import scrap_urls
8
+ from ohmyscrapper.modules.show import (
9
+ show_url,
10
+ show_urls,
11
+ show_urls_valid_prefix,
12
+ export_urls,
13
+ export_report,
14
+ )
15
+ from ohmyscrapper.modules.untouch_all import untouch_all
16
+ from ohmyscrapper.modules.process_with_ai import process_with_ai
17
+ from ohmyscrapper.modules.merge_dbs import merge_dbs
18
+
19
+
20
def main():
    """Parse the command line and dispatch to the selected ohmyscrapper command.

    Every sub-command maps to one function imported from
    ``ohmyscrapper.modules``. When no sub-command is given the usage help is
    printed instead of exiting silently.
    """
    parser = argparse.ArgumentParser(prog="ohmyscrapper")
    parser.add_argument("--version", action="version", version="%(prog)s v0.1.1")

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Commands without options.
    subparsers.add_parser("process-with-ai", help="Process with AI.")
    subparsers.add_parser("seed", help="Seed database. Necessary to classify urls.")
    subparsers.add_parser(
        "untouch-all", help="Untouch all urls. That resets classification"
    )
    subparsers.add_parser("report", help="Export urls report to csv.")
    subparsers.add_parser("merge_dbs", help="Merge databases.")

    classify_urls_parser = subparsers.add_parser(
        "classify-urls", help="Classify loaded urls"
    )
    classify_urls_parser.add_argument(
        "--recursive", default=False, help="Run in recursive mode", action="store_true"
    )

    load_txt_parser = subparsers.add_parser("load", help="Load txt file")
    load_txt_parser.add_argument(
        "-file", default="input/_chat.txt", help="File path. Default is input/_chat.txt"
    )

    scrap_urls_parser = subparsers.add_parser("scrap-urls", help="Scrap urls")
    scrap_urls_parser.add_argument(
        "--recursive", default=False, help="Run in recursive mode", action="store_true"
    )
    scrap_urls_parser.add_argument(
        "--ignore-type", default=False, help="Ignore urls types", action="store_true"
    )
    scrap_urls_parser.add_argument(
        "--randomize", default=False, help="Random order", action="store_true"
    )
    scrap_urls_parser.add_argument(
        "--only-parents", default=False, help="Only parents urls", action="store_true"
    )

    sniff_url_parser = subparsers.add_parser("sniff-url", help="Check url")
    # nargs="?" makes the positional optional; without it argparse never
    # applies the default and always demands a url.
    sniff_url_parser.add_argument(
        "url", nargs="?", default="https://cesarcardoso.cc/", help="Url to sniff"
    )

    show_urls_parser = subparsers.add_parser("show", help="Show urls and prefixes")
    show_urls_parser.add_argument(
        "--prefixes", default=False, help="Show urls valid prefix", action="store_true"
    )
    show_urls_parser.add_argument(
        "--limit", default=0, type=int, help="Limit of lines to show"
    )
    show_urls_parser.add_argument("-url", default="", help="Url to show")

    export_parser = subparsers.add_parser("export", help="Export urls to csv.")
    export_parser.add_argument(
        "--limit", default=0, type=int, help="Limit of lines to export"
    )
    export_parser.add_argument(
        "--file",
        default="output/urls.csv",
        help="File path. Default is output/urls.csv",
    )
    export_parser.add_argument(
        "--simplify",
        default=False,
        help="Ignore json and descriptions",
        action="store_true",
    )

    args = parser.parse_args()

    if args.command is None:
        parser.print_help()
    elif args.command == "classify-urls":
        classify_urls(args.recursive)
    elif args.command == "load":
        load_txt(args.file)
    elif args.command == "seed":
        seed()
    elif args.command == "untouch-all":
        untouch_all()
    elif args.command == "sniff-url":
        sniff_url(args.url)
    elif args.command == "scrap-urls":
        scrap_urls(
            recursive=args.recursive,
            ignore_valid_prefix=args.ignore_type,
            randomize=args.randomize,
            only_parents=args.only_parents,
        )
    elif args.command == "show":
        # --prefixes wins over -url, which wins over the plain listing.
        if args.prefixes:
            show_urls_valid_prefix(args.limit)
        elif args.url != "":
            show_url(args.url)
        else:
            show_urls(args.limit)
    elif args.command == "export":
        export_urls(limit=args.limit, csv_file=args.file, simplify=args.simplify)
    elif args.command == "process-with-ai":
        process_with_ai()
    elif args.command == "report":
        export_report()
    elif args.command == "merge_dbs":
        merge_dbs()


if __name__ == "__main__":
    main()
@@ -0,0 +1,4 @@
1
# Run the package-level main() when this module is executed as a script.
from . import main

if __name__ == "__main__":
    main()
@@ -0,0 +1,388 @@
1
+ import os
2
+ import sqlite3
3
+ import time
4
+ import glob
5
+ import pandas as pd
6
+ from urllib.parse import urlparse, urlunparse
7
+
8
+
9
def get_db_dir():
    """Return the database directory name ("db"), creating it when missing."""
    db_dir = "db"
    if os.path.exists(db_dir):
        return db_dir
    os.mkdir(db_dir)
    return db_dir
13
+
14
+
15
def get_db_path():
    """Return the path of the local SQLite file inside the db directory."""
    db_dir = get_db_dir()
    return f"{db_dir}/local.db"
17
+
18
+
19
def get_db_connection():
    """Open and return a new sqlite3 connection to the local database file."""
    db_path = get_db_path()
    return sqlite3.connect(db_path)
21
+
22
+
23
# TODO: check if it makes sense
# Module-level connection opened at import time; functions in this module that
# reference ``conn`` without rebinding it share this single connection.
conn = get_db_connection()
25
+
26
+
27
def create_tables():
    """Create the urls, ai_log and urls_valid_prefix tables if they are missing.

    Returns:
        A DataFrame with up to 100 rows of the ``urls`` table.
    """
    table_statements = (
        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_id INTEGER DEFAULT 0, url TEXT UNIQUE, url_destiny TEXT, h1 TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)",
        "CREATE TABLE IF NOT EXISTS ai_log (id INTEGER PRIMARY KEY, instructions STRING, response STRING, model STRING, created_at DATETIME)",
        "CREATE TABLE IF NOT EXISTS urls_valid_prefix (id INTEGER PRIMARY KEY, url_prefix TEXT UNIQUE, url_type TEXT)",
    )
    cursor = conn.cursor()
    for statement in table_statements:
        cursor.execute(statement)

    return pd.read_sql_query("SELECT * FROM urls LIMIT 100", conn)
42
+
43
+
44
+ # TODO: not sure this should be something. depends on the project
45
def seeds():
    """Ensure the tables exist and register the built-in LinkedIn url prefixes.

    Returns:
        Always ``True``.
    """
    create_tables()

    # (LIKE pattern, url_type) pairs, registered in this order.
    default_prefixes = (
        ("https://%.linkedin.com/posts/%", "linkedin_post"),
        ("https://lnkd.in/%", "linkedin_redirect"),
        ("https://%.linkedin.com/jobs/view/%", "linkedin_job"),
        ("https://%.linkedin.com/feed/%", "linkedin_feed"),
        ("https://%.linkedin.com/company/%", "linkedin_company"),
    )
    for pattern, kind in default_prefixes:
        add_urls_valid_prefix(pattern, kind)

    # Not enabled: a "%.pdf" -> "pdf" prefix and a sample seed url.

    return True
58
+
59
+
60
def add_urls_valid_prefix(url_prefix, url_type):
    """Register a url prefix pattern with its type unless the prefix already exists.

    Args:
        url_prefix: Pattern stored in ``urls_valid_prefix.url_prefix``.
        url_type: Type label stored alongside the pattern.
    """
    # This function works on its own connection (it does not use the
    # module-level one), so it is closed here once the work is done.
    conn = get_db_connection()
    try:
        # Bound parameter instead of string interpolation: patterns containing
        # quotes can neither break the query nor inject SQL.
        existing = pd.read_sql_query(
            "SELECT * FROM urls_valid_prefix WHERE url_prefix = ?",
            conn,
            params=(url_prefix,),
        )
        if len(existing) == 0:
            conn.execute(
                "INSERT INTO urls_valid_prefix (url_prefix, url_type) VALUES (?, ?)",
                (url_prefix, url_type),
            )
            conn.commit()
    finally:
        conn.close()
73
+
74
+
75
def get_urls_valid_prefix_by_type(url_type):
    """Return the ``urls_valid_prefix`` rows whose ``url_type`` equals *url_type*.

    The value is passed as a bound parameter rather than interpolated into the
    SQL text, so quotes in it cannot alter the query.
    """
    return pd.read_sql_query(
        "SELECT * FROM urls_valid_prefix WHERE url_type = ?",
        conn,
        params=(url_type,),
    )
80
+
81
+
82
def get_urls_valid_prefix_by_id(id):
    """Return the ``urls_valid_prefix`` row(s) with the given id as a DataFrame.

    The id is bound as text, matching the quoted literal the query used
    before; SQLite coerces it when comparing with the integer key.
    """
    return pd.read_sql_query(
        "SELECT * FROM urls_valid_prefix WHERE id = ?",
        conn,
        params=(str(id),),
    )
85
+
86
+
87
+ # TODO: pagination required
88
def get_urls_valid_prefix(limit=0):
    """Return the ``urls_valid_prefix`` rows, capped at *limit* when it is > 0."""
    query = "SELECT * FROM urls_valid_prefix"
    if limit > 0:
        query = f"{query} LIMIT {limit}"
    return pd.read_sql_query(query, conn)
94
+
95
+
96
+ # TODO: pagination required
97
def get_urls(limit=0):
    """Return rows of ``urls`` ordered by ``history`` ascending.

    Args:
        limit: Maximum number of rows; ``0`` (or any value <= 0) means no cap.

    Returns:
        A DataFrame with the selected rows.
    """
    if limit > 0:
        # ORDER BY must come before LIMIT; the previous
        # "LIMIT n ORDER BY ..." form is rejected by SQLite as a syntax error.
        return pd.read_sql_query(
            "SELECT * FROM urls ORDER BY history ASC LIMIT ?",
            conn,
            params=(int(limit),),
        )
    return pd.read_sql_query("SELECT * FROM urls ORDER BY history ASC", conn)
105
+
106
+
107
def get_urls_report():
    """Return the report rows: non-history urls that are not parents themselves.

    Each row carries the url's id, type, url and h1 plus the id and h1 of its
    parent (NULL when the url has no parent row).
    """
    report_query = """
        WITH parents_id AS (
            SELECT parent_id FROM urls WHERE parent_id != 0 GROUP BY parent_id
        ),
        parents AS (
            SELECT
                u.id,
                u.h1
            FROM urls u
            INNER JOIN parents_id p
            ON u.id = p.parent_id
        )
        SELECT
            u.id,
            u.url_type,
            u.url,
            u.h1,
            p.id as parent_id,
            p.h1 as parent_h1
        FROM urls u
        LEFT JOIN parents p
        ON u.parent_id = p.id
        WHERE u.history = 0 AND u.id NOT IN (SELECT id FROM parents)
    """
    return pd.read_sql_query(report_query, conn)
135
+
136
+
137
def get_url_by_url(url):
    """Return the ``urls`` row(s) matching *url* after normalisation.

    The url is normalised with ``clean_url`` and passed as a bound parameter,
    so the lookup no longer depends on quote stripping to stay valid SQL.
    """
    url = clean_url(url)
    return pd.read_sql_query(
        "SELECT * FROM urls WHERE url = ?", conn, params=(url,)
    )
142
+
143
+
144
def get_url_by_id(id):
    """Return the ``urls`` row(s) with the given id as a DataFrame.

    The id is bound as text (matching the quoted literal used before) instead
    of being interpolated into the SQL, so arbitrary text cannot alter the
    query.
    """
    return pd.read_sql_query(
        "SELECT * FROM urls WHERE id = ?", conn, params=(str(id),)
    )
148
+
149
+
150
def get_urls_by_url_type(url_type):
    """Return non-history ``urls`` rows of the given type.

    *url_type* is passed as a bound parameter rather than interpolated.
    """
    return pd.read_sql_query(
        "SELECT * FROM urls WHERE history = 0 AND url_type = ?",
        conn,
        params=(url_type,),
    )
155
+
156
+
157
def get_urls_by_url_type_for_ai_process(url_type="linkedin_post", limit=10):
    """Return up to *limit* non-history rows of *url_type* with ``ai_processed = 0``.

    Both values are passed as bound parameters rather than interpolated.
    """
    return pd.read_sql_query(
        "SELECT * FROM urls WHERE history = 0 AND url_type = ? AND ai_processed = 0 LIMIT ?",
        conn,
        params=(url_type, int(limit)),
    )
163
+
164
+
165
def get_url_like_unclassified(like_condition):
    """Return non-history rows without a type whose url matches a LIKE pattern.

    Args:
        like_condition: SQL LIKE pattern (``%`` wildcards), bound as a
            parameter so quotes in it cannot break the query.
    """
    return pd.read_sql_query(
        "SELECT * FROM urls WHERE history = 0 AND url LIKE ? AND url_type IS NULL",
        conn,
        params=(like_condition,),
    )
171
+
172
+
173
def add_url(url, h1=None, parent_id=0):
    """Insert *url* (normalised) when it is not stored yet and return its row(s).

    Args:
        url: Url to store; normalised with ``clean_url`` first.
        h1: Optional title; stripped of surrounding whitespace when given.
        parent_id: Id of the parent url; ``None`` is treated as 0.

    Returns:
        The DataFrame returned by ``get_url_by_url`` for the normalised url.
    """
    url = clean_url(url)
    title = h1.strip() if h1 is not None else None
    parent = 0 if parent_id is None else int(parent_id)

    is_new = len(get_url_by_url(url)) == 0
    if is_new:
        conn.execute(
            "INSERT INTO urls (url, h1, parent_id, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
            (url, title, parent, int(time.time())),
        )
        conn.commit()

    return get_url_by_url(url)
193
+
194
+
195
def add_ai_log(instructions, response, model):
    """Append one row to ``ai_log`` stamped with the current epoch second."""
    log_row = (instructions, response, model, int(time.time()))
    conn.execute(
        "INSERT INTO ai_log (instructions, response, model, created_at) VALUES (?, ?, ?, ?)",
        log_row,
    )
    conn.commit()
203
+
204
+
205
def set_url_destiny(url, destiny):
    """Record *destiny* as the destination of *url* and link it as a child.

    Sets ``url_destiny`` on the row of *url*, then sets ``parent_id`` on the
    row of *destiny* to the id of *url*. Both urls are normalised first.
    """
    source_url = clean_url(url)
    destiny_url = clean_url(destiny)
    source_rows = get_url_by_url(source_url)

    cursor = conn.cursor()
    cursor.execute(
        "UPDATE urls SET url_destiny = ? WHERE url = ?", (destiny_url, source_url)
    )
    # Cast to a plain int before binding it as a parameter.
    source_id = int(source_rows.iloc[0]["id"])
    cursor.execute(
        "UPDATE urls SET parent_id = ? WHERE url = ?", (source_id, destiny_url)
    )

    conn.commit()
217
+
218
+
219
def set_url_h1(url, value):
    """Store ``str(value)`` (stripped) as the h1 of the row matching *url*."""
    target_url = clean_url(url)
    title = str(value).strip()
    conn.execute("UPDATE urls SET h1 = ? WHERE url = ?", (title, target_url))
    conn.commit()
225
+
226
+
227
def set_url_h1_by_id(id, value):
    """Store ``str(value)`` (stripped) as the h1 of the row with the given id."""
    title = str(value).strip()
    conn.execute("UPDATE urls SET h1 = ? WHERE id = ?", (title, id))
    conn.commit()
233
+
234
+
235
def set_url_ai_processed_by_id(id):
    """Mark the row with the given id as processed by AI."""
    # Bound as the text "1", exactly as before.
    processed_flag = "1"
    conn.execute(
        "UPDATE urls SET ai_processed = ? WHERE id = ?", (processed_flag, id)
    )
    conn.commit()
241
+
242
+
243
def set_url_ai_processed_by_url(url):
    """Mark the row matching *url* (normalised) as processed by AI."""
    # Bound as the text "1", exactly as before.
    processed_flag = "1"
    target_url = clean_url(url)
    conn.execute(
        "UPDATE urls SET ai_processed = ? WHERE url = ?", (processed_flag, target_url)
    )
    conn.commit()
250
+
251
+
252
def set_url_description(url, value):
    """Store *value* as the description of the row matching *url* (normalised)."""
    target_url = clean_url(url)
    conn.execute(
        "UPDATE urls SET description = ? WHERE url = ?", (value, target_url)
    )
    conn.commit()
257
+
258
+
259
def set_url_description_links(url, value):
    """Store *value* in ``description_links`` for the row matching *url*."""
    target_url = clean_url(url)
    conn.execute(
        "UPDATE urls SET description_links = ? WHERE url = ?", (value, target_url)
    )
    conn.commit()
264
+
265
+
266
def set_url_json(url, value):
    """Store *value* in the ``json`` column of the row matching *url*."""
    target_url = clean_url(url)
    conn.execute("UPDATE urls SET json = ? WHERE url = ?", (value, target_url))
    conn.commit()
271
+
272
+
273
def set_url_error(url, value):
    """Store *value* in the ``error`` column of the row matching *url*."""
    target_url = clean_url(url)
    conn.execute("UPDATE urls SET error = ? WHERE url = ?", (value, target_url))
    conn.commit()
278
+
279
+
280
def set_url_type_by_id(url_id, url_type):
    """Set ``url_type`` on the ``urls`` row with id *url_id*.

    Both values are bound as parameters instead of being formatted into the
    SQL text. The id is cast to a plain ``int`` first, because ids read back
    through pandas may not be a type sqlite3 can bind.
    """
    conn.execute(
        "UPDATE urls SET url_type = ? WHERE id = ?", (url_type, int(url_id))
    )
    conn.commit()
284
+
285
+
286
def clean_url(url):
    """Normalise a url string.

    Forces an ``https://`` scheme prefix, drops the ``#fragment``, removes
    query parameters starting with ``utm_`` and strips single quotes.
    """
    if url.startswith("http://"):
        url = "https://" + url[len("http://"):]

    if not url.startswith("https://"):
        url = "https://" + url

    without_fragment = url.split("#")[0]
    parsed = urlparse(without_fragment)
    kept_params = [
        param for param in parsed.query.split("&") if not param.startswith("utm_")
    ]
    rebuilt = urlunparse(parsed._replace(query="&".join(kept_params)))
    return rebuilt.replace("'", "")
301
+
302
+
303
def get_untouched_urls(
    limit=10, randomize=True, ignore_valid_prefix=False, only_parents=True
):
    """Return up to *limit* non-history urls that have never been touched.

    Args:
        limit: Maximum number of rows.
        randomize: Order randomly when true, otherwise newest ``created_at`` first.
        ignore_valid_prefix: When false, only urls with a ``url_type`` are returned.
        only_parents: When true, only urls with ``parent_id = 0`` are returned.
    """
    conditions = ["1 = 1", "history = 0"]
    if not ignore_valid_prefix:
        conditions.append("url_type IS NOT NULL")
    if only_parents:
        conditions.append("parent_id = 0")
    conditions.append("last_touch IS NULL")

    ordering = "RANDOM()" if randomize else "created_at DESC"
    where_clause = " AND ".join(conditions)
    query = f"SELECT * FROM urls WHERE {where_clause} ORDER BY {ordering} LIMIT {limit}"
    return pd.read_sql_query(query, conn)
320
+
321
+
322
def touch_url(url):
    """Set ``last_touch`` of the row matching *url* to the current epoch second."""
    target_url = clean_url(url)
    touched_at = int(time.time())
    conn.execute(
        "UPDATE urls SET last_touch = ? WHERE url = ?", (touched_at, target_url)
    )
    conn.commit()
327
+
328
+
329
def untouch_url(url):
    """Reset ``last_touch`` to NULL for the row matching *url* (normalised)."""
    url = clean_url(url)
    c = conn.cursor()
    # Parameters must be a sequence of values: "(url)" is just the string
    # itself, which sqlite3 treats as one binding per character and rejects.
    # The trailing comma makes it a 1-tuple.
    c.execute("UPDATE urls SET last_touch = NULL WHERE url = ?", (url,))
    conn.commit()
334
+
335
+
336
def untouch_all_urls():
    """Reset ``last_touch`` to NULL on every non-history url."""
    conn.execute("UPDATE urls SET last_touch = NULL WHERE history = 0")
    conn.commit()
340
+
341
+
342
def set_all_urls_as_history():
    """Flag every row of ``urls`` as history (``history = 1``)."""
    conn.execute("UPDATE urls SET history = 1")
    conn.commit()
346
+
347
+
348
def merge_dbs() -> None:
    """Interactively pick another ``.db`` file in the db directory and merge its urls.

    Lists every ``*.db`` file in the database directory except the production
    one, asks which to merge, then passes each row of its ``urls`` table to
    ``merge_url``. Does nothing when no other database file is present.
    """
    # Compare normalised paths: glob may spell the production file differently
    # (e.g. other separators) than get_db_path(), and list.remove() on an
    # absent entry would raise ValueError.
    production_db_file = os.path.normpath(get_db_path())
    candidates = [
        path
        for path in glob.glob(get_db_dir() + "/*.db")
        if os.path.normpath(path) != production_db_file
    ]
    if not candidates:
        return

    print("\nAvailable dbs:")
    for index, file in enumerate(candidates):
        print(index, ":", file)

    db_number = -1
    while db_number < 0 or db_number >= len(candidates):
        try:
            db_number = int(input("Choose the db to merge: "))
        except ValueError:
            # Non-numeric answer: ask again instead of crashing.
            db_number = -1

    source_file = candidates[db_number]
    print(source_file)
    source_conn = sqlite3.connect(source_file)
    try:
        df = pd.read_sql_query("SELECT * FROM urls", source_conn)
    finally:
        source_conn.close()

    for _, row in df.iterrows():
        merge_url(
            row["url"],
            f"merged from {source_file}",
            row["last_touch"],
            row["created_at"],
            row["description"],
            row["json"],
        )
374
+
375
+
376
def merge_url(url, h1, last_touch, created_at, description, json):
    """Insert a url coming from another database as a history row, if unknown.

    The url is normalised first; nothing happens when it is already stored.
    New rows are inserted with ``history = 1``.
    """
    url = clean_url(url)
    title = None if h1 is None else h1.strip()

    if len(get_url_by_url(url)) != 0:
        return

    conn.execute(
        "INSERT INTO urls (url, h1, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
        (url, title, last_touch, created_at, description, json),
    )
    conn.commit()
@@ -0,0 +1,23 @@
1
+ import ohmyscrapper.models.urls_manager as urls_manager
2
+ import pandas as pd
3
+ import time
4
+
5
+
6
def classify_urls(recursive=False):
    """Assign a url_type to every unclassified url that matches a known prefix.

    Args:
        recursive: When true, repeat forever with a 10-second pause between
            passes; otherwise run a single pass.
    """
    urls_manager.seeds()
    prefixes = urls_manager.get_urls_valid_prefix()

    while True:
        print("waking up!")
        for _, prefix_row in prefixes.iterrows():
            matching_urls = urls_manager.get_url_like_unclassified(
                prefix_row["url_prefix"]
            )
            for _, url_row in matching_urls.iterrows():
                urls_manager.set_url_type_by_id(url_row["id"], prefix_row["url_type"])

        if not recursive:
            print("ending...")
            break

        print("sleeping...")
        time.sleep(10)
@@ -0,0 +1,32 @@
1
+ import os
2
+ from urlextract import URLExtract
3
+ import ohmyscrapper.models.urls_manager as urls_manager
4
+
5
+
6
def load_txt(file_name="input/_chat.txt"):
    """Read a text file and store every url found in it.

    Creates the ``input`` directory when missing, makes sure the tables and
    seed prefixes exist, then hands the file content to
    ``put_urls_from_string``.

    Args:
        file_name: Path of the text file to read.
    """
    if not os.path.exists("input"):
        os.mkdir("input")

    urls_manager.create_tables()
    urls_manager.seeds()
    # make it recursive for all files
    # Context manager so the file handle is closed as soon as it is read.
    with open(file_name, "r") as text_file:
        text_file_content = text_file.read()

    put_urls_from_string(text_file_content)

    # move_it_to_processed
    print("--------------------")
    print(file_name, "processed")
21
+
22
+
23
def put_urls_from_string(text_to_process, parent_id=None):
    """Extract the urls found in a string and store each one.

    Args:
        text_to_process: Text to scan; anything that is not a ``str`` is ignored.
        parent_id: Parent url id passed on to ``urls_manager.add_url``.

    Returns:
        The number of urls found, or 0 when the input is not a string.
    """
    if not isinstance(text_to_process, str):
        return 0

    # Extract once and reuse the result for both the loop and the count;
    # previously the text was scanned a second time just to count.
    found_urls = URLExtract().find_urls(text_to_process)
    for url in found_urls:
        urls_manager.add_url(url, parent_id=parent_id)
        print(url, "added")

    return len(found_urls)
@@ -0,0 +1,6 @@
1
+ import ohmyscrapper.models.urls_manager as urls_manager
2
+
3
+
4
def merge_dbs():
    """Run the interactive database merge implemented in ``urls_manager``."""
    urls_manager.merge_dbs()
@@ -0,0 +1,160 @@
1
+ import ohmyscrapper.models.urls_manager as urls_manager
2
+ from bs4 import BeautifulSoup
3
+ from google import genai
4
+ from dotenv import load_dotenv
5
+ import random
6
+ import time
7
+ import os
8
+ import yaml
9
+
10
+ load_dotenv()
11
+
12
+
13
def process_with_ai(recursive=True):
    """Send pending url descriptions to the AI model and store the returned titles.

    Args:
        recursive: When true, keep running rounds (with a 1-3 second pause
            between them) until a round finds nothing to do; otherwise run a
            single round.
    """
    # A loop replaces the former self-recursion, so long runs no longer grow
    # the call stack (and cannot hit RecursionError).
    while _process_with_ai_round():
        if not recursive:
            break
        wait = random.randint(1, 3)
        print("sleeping for", wait, "seconds before next round")
        time.sleep(wait)

    return


def _process_with_ai_round():
    """Run one AI round; return False when it stopped early with nothing to do."""
    # The prompt is resolved again on every round, as it was before.
    prompt = _get_prompt()
    if not prompt:
        return False

    url_type = "linkedin_post"
    df = urls_manager.get_urls_by_url_type_for_ai_process(url_type)

    if len(df) == 0:
        print("no urls to process with ai anymore")
        return False

    texts = "".join(
        f"\n<texto>\n<id>{str(row['id'])}</id>\n{row['description']}\n</texto>\n"
        for _, row in df.iterrows()
    )
    if texts == "":
        print("no urls to process")
        return False

    print("starting...")
    print("prompt:", prompt["name"])
    print("model:", prompt["model"])
    print("description:", prompt["description"])
    prompt["instrusctions"] = prompt["instrusctions"].replace(
        "{ohmyscrapper_texts}", texts
    )

    # The client gets the API key from the environment variable `GEMINI_API_KEY`.
    client = genai.Client()
    response = client.models.generate_content(
        model=prompt["model"], contents=prompt["instrusctions"]
    )
    response = str(response.text)
    urls_manager.add_ai_log(
        instructions=prompt["instrusctions"], response=response, model=prompt["model"]
    )
    print(response)
    print("^^^^^^")
    soup = BeautifulSoup(response, "html.parser")
    for item in soup.find_all(prompt["xml-item"]):
        url = urls_manager.get_url_by_id(item.find("id").text)
        if len(url) > 0:
            url = url.iloc[0]
            # TODO: make it dynamic
            h1 = item.find("titulo").text
            if (
                item.find("contratante").text != "desconhecido"
                and item.find("contratante").text != ""
            ):
                h1 = h1 + " - " + item.find("contratante").text
            if url["description_links"] > 1 and item.find("id").text != "":
                # Several links in the description: the title goes to the
                # url named in the response item.
                urls_manager.set_url_h1(item.find("url").text, h1)
                urls_manager.set_url_ai_processed_by_url(item.find("url").text)

                print("-- child updated -- ", item.find("url").text, h1)
            elif url["description_links"] <= 1:
                urls_manager.set_url_h1_by_id(item.find("id").text, h1)
                urls_manager.set_url_ai_processed_by_id(item.find("id").text)
                print("-- parent updated -- ", url["url"], h1)
            else:
                print("-- not updated -- ", url["url"], h1)

    print("ending...")
    return True
86
+
87
def _get_prompt():
    """Return the prompt to use, or ``False`` when none is available or chosen.

    Creates the ``prompts`` folder with a default ``prompt.md`` when there is
    no prompt file yet (and returns ``False`` so the user can edit it). With
    one file it is used directly; with several the user is asked to choose,
    and typing ``q`` returns ``False``.
    """
    prompts_path = "prompts"
    default_prompt = (
        "---\n"
        'model: "gemini-2.5-flash"\n'
        'name: "default-prompt"\n'
        'description: "Put here your prompt description."\n'
        'xml-item: "position"\n'
        "---\n"
        "Process with AI this prompt: {ohmyscrapper_texts}\n"
    )
    if not os.path.exists(prompts_path):
        os.mkdir(prompts_path)

    # A freshly created folder is empty, so one branch covers both the
    # "no folder" and the "empty folder" cases.
    prompt_files = os.listdir(prompts_path)
    if len(prompt_files) == 0:
        with open(f"{prompts_path}/prompt.md", "w") as default_file:
            default_file.write(default_prompt)
        print(f"You didn't have a prompt file. One was created in the /{prompts_path} folder. You can change it there.")
        return False

    if len(prompt_files) == 1:
        return _parse_prompt(prompts_path, prompt_files[0])

    prompts = {
        index: _parse_prompt(prompts_path, file)
        for index, file in enumerate(prompt_files)
    }
    # Ask again on an invalid answer with a loop rather than by recursing.
    while True:
        print("Choose a prompt:")
        for index, parsed in prompts.items():
            print(index, ":", parsed["name"])
        input_prompt = input("Type the number of the prompt you want to use or 'q' to quit: ")
        if input_prompt == "q":
            return False
        try:
            return prompts[int(input_prompt)]
        except (ValueError, KeyError):
            # Not a number, or a number that is not in the menu.
            print("! Invalid prompt\n")
128
+
129
def _parse_prompt(prompts_path, prompt_file):
    """Parse a prompt file made of a ``---`` delimited YAML header and a body.

    Args:
        prompts_path: Folder that contains the prompt file.
        prompt_file: File name inside *prompts_path*.

    Returns:
        The dict loaded from the YAML header, with the body text (stripped)
        stored under the ``"instrusctions"`` key.

    Raises:
        ValueError: If the file does not contain the two ``---`` delimiters.
    """
    with open(f"{prompts_path}/{prompt_file}", "r") as handle:
        # Split on the first two delimiters only, so a "---" inside the
        # instructions stays part of the instructions instead of cutting them.
        raw_prompt = handle.read().split("---", 2)
    if len(raw_prompt) < 3:
        raise ValueError(
            f"{prompts_path}/{prompt_file} is missing the '---' delimited header"
        )

    prompt = yaml.safe_load(raw_prompt[1])
    # The misspelt key is what the rest of the module reads; keep it as is.
    prompt["instrusctions"] = raw_prompt[2].strip()

    return prompt
136
+ # TODO: Separate gemini from basic function
137
+ def _process_with_gemini(model, instructions):
138
+ response = """"""
139
+ return response
140
+
141
+
142
+ def _process_with_openai(model, instructions):
143
+ # import os
144
+ # from openai import OpenAI
145
+
146
+ # client = OpenAI(
147
+ # # This is the default and can be omitted
148
+ # api_key=os.environ.get("OPENAI_API_KEY"),
149
+ # )
150
+
151
+ # response = client.responses.create(
152
+ # model="gpt-4o",
153
+ # instructions="You are a coding assistant that talks like a pirate.",
154
+ # input="How do I check if a Python object is an instance of a class?",
155
+ # )
156
+
157
+ # print(response.output_text)
158
+
159
+ response = """"""
160
+ return response
@@ -0,0 +1,179 @@
1
+ import ohmyscrapper.models.urls_manager as urls_manager
2
+ import ohmyscrapper.modules.sniff_url as sniff_url
3
+ import ohmyscrapper.modules.load_txt as load_txt
4
+ import ohmyscrapper.modules.classify_urls as classify_urls
5
+
6
+ import time
7
+ import random
8
+
9
+
10
def process_linkedin_redirect(url_report, url):
    """Resolve a LinkedIn redirect url to its destination and store the link.

    With fewer than 5 links on the page the first link is taken as the
    destination, otherwise the ``og:url`` tag. When the chosen key is missing
    an error is stored on the url instead.
    """
    print("linkedin_redirect")

    source_key = "first-a-link" if url_report["total-a-links"] < 5 else "og:url"
    if source_key not in url_report.keys():
        urls_manager.set_url_error(url["url"], "error: no " + source_key)
        print("no url for:", url["url"])
        return

    url_destiny = url_report[source_key]
    print(url["url"], ">>", url_destiny)
    urls_manager.add_url(url_destiny)
    urls_manager.set_url_destiny(url["url"], url_destiny)
31
+
32
+
33
def process_linkedin_feed(url_report, url):
    """Store the ``og:url`` of a LinkedIn feed page as the url's destination.

    When the report has no ``og:url`` an error is stored on the url instead.
    """
    print("linkedin_feed")

    if "og:url" not in url_report.keys():
        urls_manager.set_url_error(url["url"], "error: no og:url")
        print("no url for:", url["url"])
        return

    url_destiny = url_report["og:url"]
    print(url["url"], ">>", url_destiny)
    urls_manager.add_url(url_destiny)
    urls_manager.set_url_destiny(url["url"], url_destiny)
46
+
47
+
48
def process_linkedin_job(url_report, url):
    """Store title and description of a LinkedIn job page.

    The title comes from ``h1`` (falling back to ``og:title``), the
    description from ``description`` (falling back to ``og:description``).
    When neither was found an error is stored on the url.
    """
    print("linkedin_job")
    present_keys = url_report.keys()

    title_key = next((k for k in ("h1", "og:title") if k in present_keys), None)
    if title_key is not None:
        print(url["url"], ": ", url_report[title_key])
        urls_manager.set_url_h1(url["url"], url_report[title_key])

    description_key = next(
        (k for k in ("description", "og:description") if k in present_keys), None
    )
    if description_key is not None:
        urls_manager.set_url_description(url["url"], url_report[description_key])

    if title_key is None and description_key is None:
        urls_manager.set_url_error(url["url"], "error: no h1 or description")
68
+
69
+
70
def process_linkedin_post(url_report, url):
    """Store title, description and description links of a post or generic page.

    The title comes from ``h1`` (falling back to ``og:title``), the
    description from ``description`` (falling back to ``og:description``).
    Urls found in the description are stored as children of this url and
    their count is recorded. When nothing was found an error is stored.
    """
    print("linkedin_post or generic")
    print(url["url"])
    present_keys = url_report.keys()

    found_title = True
    if "h1" in present_keys:
        print(url["url"], ": ", url_report["h1"])
        urls_manager.set_url_h1(url["url"], url_report["h1"])
    elif "og:title" in present_keys:
        urls_manager.set_url_h1(url["url"], url_report["og:title"])
    else:
        found_title = False

    found_description = True
    description = None
    if "description" in present_keys:
        description = url_report["description"]
    elif "og:description" in present_keys:
        description = url_report["og:description"]
    else:
        found_description = False

    if description is not None:
        urls_manager.set_url_description(url["url"], description)
        description_links = load_txt.put_urls_from_string(description, url["id"])
        urls_manager.set_url_description_links(url["url"], description_links)

    if not (found_title or found_description):
        urls_manager.set_url_error(url["url"], "error: no h1 or description")
96
+
97
+
98
def scrap_url(url):
    """Fetch one url, run the handler for its type and mark it as touched.

    Args:
        url: Row-like object read and written by key; this function uses
            ``"url"`` and ``"url_type"``.

    Urls without a type are handled as ``"generic"``. When fetching fails, the
    text ``"error"`` is stored on the url, the url is touched and processing
    of this url stops.
    """
    # The prefix table used to be loaded here on every call although the
    # result was never used; that dead query has been removed.
    if url["url_type"] is None:
        print("\n\ngeneric:", url["url"])
        url["url_type"] = "generic"
    else:
        print("\n\n", url["url_type"] + ":", url["url"])

    try:
        url_report = sniff_url.get_tags(url["url"])
    except Exception as error:
        urls_manager.set_url_error(url["url"], "error")
        urls_manager.touch_url(url["url"])
        print("\n\n!!! ERROR FOR:", url["url"])
        # Show the reason instead of discarding the caught exception.
        print("!!! reason:", repr(error))
        print(
            "\n\n!!! you can check the URL using the command sniff-url",
            url["url"],
            "\n\n",
        )
        return

    # The types are mutually exclusive, so one lookup replaces the if-chain.
    handlers = {
        # linkedin_redirect - linkedin (https://lnkd.in/)
        "linkedin_redirect": process_linkedin_redirect,
        # linkedin_feed - linkedin (https://%.linkedin.com/feed/)
        "linkedin_feed": process_linkedin_feed,
        # linkedin_job - linkedin (https://www.linkedin.com/jobs/)
        "linkedin_job": process_linkedin_job,
        # posts and untyped urls share one handler
        "linkedin_post": process_linkedin_post,
        "generic": process_linkedin_post,
    }
    handler = handlers.get(url["url_type"])
    if handler is not None:
        handler(url_report, url)

    urls_manager.set_url_json(url["url"], url_report["json"])
    urls_manager.touch_url(url["url"])
141
+
142
+
143
def isNaN(num):
    """Return whether *num* compares unequal to itself (true for float NaN)."""
    differs_from_itself = num != num
    return differs_from_itself
145
+
146
+
147
def scrap_urls(
    recursive=False, ignore_valid_prefix=False, randomize=False, only_parents=True
):
    """Scrap the untouched urls, one batch per round.

    Each round classifies urls, fetches a batch of untouched ones, scraps each
    (pausing 1-3 seconds after every url) and classifies again.

    Args:
        recursive: When true, keep running rounds (5-10 second pause between
            them) until a round finds no urls; otherwise run one round.
        ignore_valid_prefix: Passed to ``urls_manager.get_untouched_urls``.
        randomize: Passed to ``urls_manager.get_untouched_urls``.
        only_parents: Passed to ``urls_manager.get_untouched_urls``.
    """
    # A loop replaces the former self-recursion so that long recursive runs
    # do not grow the call stack.
    while True:
        classify_urls.classify_urls()
        urls = urls_manager.get_untouched_urls(
            ignore_valid_prefix=ignore_valid_prefix,
            randomize=randomize,
            only_parents=only_parents,
        )
        if len(urls) == 0:
            print("no urls to scrap")
            return

        for _, url in urls.iterrows():
            scrap_url(url)

            # Short pause after each url (a former 15-20 s value was
            # immediately overwritten by this one and has been dropped).
            wait = random.randint(1, 3)
            print("sleeping for", wait, "seconds")
            time.sleep(wait)

        classify_urls.classify_urls()
        if not recursive:
            print("ending...")
            return

        wait = random.randint(5, 10)
        print("sleeping for", wait, "seconds before next round")
        time.sleep(wait)
@@ -0,0 +1,7 @@
1
+ import ohmyscrapper.models.urls_manager as urls_manager
2
+
3
+
4
def seed():
    """Seed the database through ``urls_manager.seeds`` and report it."""
    urls_manager.seeds()
    print("db seeded")
@@ -0,0 +1,116 @@
1
+ import ohmyscrapper.models.urls_manager as urls_manager
2
+ import math
3
+ from rich.console import Console
4
+ from rich.table import Table
5
+
6
+
7
def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):
    """Export the stored urls to a CSV file plus an HTML preview.

    Args:
        limit: Maximum number of rows, passed to ``urls_manager.get_urls``.
        csv_file: Destination CSV path; the preview is written next to it as
            ``<csv_file>-preview.html``.
        simplify: When true, the ``description`` and ``json`` columns are dropped.
    """
    import os  # local import: this module has no file-level ``import os``

    df = urls_manager.get_urls(limit)

    if simplify:
        df.drop(columns=["description", "json"], inplace=True)

    # Create the destination folder first; otherwise writing to the default
    # "output/" path fails when that folder does not exist yet.
    output_dir = os.path.dirname(csv_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(csv_file, index=False)
    print("--------------------")
    print("Urls exported to", csv_file)

    # Line breaks in descriptions are flattened for the HTML preview only.
    df.replace(
        {
            "description": {r"\n": " "},
        },
        regex=True,
        inplace=True,
    )
    df.to_html(csv_file + "-preview.html", index=False)
    print("Urls preview exported to", csv_file + "-preview.html")
    print("--------------------")
27
+
28
+
29
def export_report(csv_file="output/report.csv"):
    """Export the urls report to a CSV file plus an HTML preview.

    Args:
        csv_file: Destination CSV path; the preview is written next to it as
            ``<csv_file>-preview.html``.
    """
    import os  # local import: this module has no file-level ``import os``

    df = urls_manager.get_urls_report()

    # Create the destination folder first; otherwise writing to the default
    # "output/" path fails when that folder does not exist yet.
    output_dir = os.path.dirname(csv_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(csv_file, index=False)
    print("--------------------")
    print("Urls report exported to", csv_file)

    df.replace(
        {
            "description": {r"\n": " "},
        },
        regex=True,
        inplace=True,
    )
    df.to_html(csv_file + "-preview.html", index=False)
    print("Urls report preview exported to", csv_file + "-preview.html")
    print("--------------------")
46
+
47
+
48
def show_urls(limit=0, jump_to_page=0):
    """Show the stored urls as paged tables (15 rows per page) in the terminal.

    After each page the user can press enter for the next page, type a page
    number to jump to it, or type ``q`` to quit. An out-of-range page number
    prints a notice and restarts from the first page.

    Args:
        limit: Maximum number of rows, passed to ``urls_manager.get_urls``.
        jump_to_page: Zero-based index of the first page to show.
    """
    df = urls_manager.get_urls(limit)
    df.drop(columns=["json", "description"], inplace=True)

    # https://medium.com/@inzaniak/create-tables-in-your-terminal-with-python-6747d68d71a6

    total_items = len(df)
    items_per_page = 15
    n_pages = math.ceil(total_items / items_per_page)

    # Pages are sliced by position, which replaces the former transpose/pop
    # bookkeeping and lets a jump back reuse the rows already loaded instead
    # of re-running the query through a recursive call.
    page = max(jump_to_page, 0)
    while page < n_pages:
        start = page * items_per_page
        show_table(df.iloc[start : start + items_per_page])

        print("Page", page + 1, "of", n_pages)
        user_input = input("Press enter to continue or type q to quit: ")
        if user_input == "q":
            break
        # isdecimal() only accepts strings that int() can parse.
        if user_input.isdecimal():
            requested_page = int(user_input)
            if 1 <= requested_page <= n_pages:
                page = requested_page - 1
            else:
                print("This page does not exist")
                page = 0
            continue
        page += 1

    return
93
+
94
+
95
+ # TODO: Change place
96
def show_table(df):
    """Print a DataFrame as a rich table: one column per field, all values as text."""
    table = Table(show_header=True, header_style="bold magenta")
    for column_name in df.columns.tolist():
        table.add_column(column_name)

    for record in df.to_dict(orient="records"):
        table.add_row(*map(str, record.values()))

    Console().print(table)
107
+
108
+
109
def show_urls_valid_prefix(limit=0):
    """Print the registered url prefixes, capped at *limit* rows when > 0."""
    prefixes = urls_manager.get_urls_valid_prefix(limit)
    print(prefixes)
112
+
113
+
114
def show_url(url):
    """Print the stored row(s) for *url*, transposed to one field per line."""
    matching_rows = urls_manager.get_url_by_url(url)
    print(matching_rows.T)
@@ -0,0 +1,88 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import json
4
+
5
+
6
def sniff_url(
    url="https://www.linkedin.com/in/cesardesouzacardoso/", silent=False, timeout=30
):
    """Fetch *url* and build a report of its headings, links and meta tags.

    Args:
        url: Page to download and parse.
        silent: When False, print every <meta> tag, every <a> link and the
            final report while working.
        timeout: Seconds passed to ``requests.get`` so an unresponsive host
            cannot block the caller indefinitely.

    Returns:
        A dict with ``scrapped-url``, the first ``h1``/``h2`` text when
        present, the content of each matched meta tag keyed by its name (or
        property), ``first-a-link``, ``total-a-links``, ``total-meta-tags``
        and ``json`` (the JSON dump of all the previous keys).
    """
    if not silent:
        print("checking url:", url)

    tags_to_search = [
        "description",
        "og:url",
        "og:title",
        "og:description",
        "og:type",
        "lnkd:url",
    ]

    r = requests.get(url=url, timeout=timeout)
    soup = BeautifulSoup(r.text, "html.parser")

    # Query the document once per tag type instead of on every use.
    meta_tags = soup.find_all("meta")
    a_tags = soup.find_all("a")
    h1_tag = soup.find("h1")
    h2_tag = soup.find("h2")

    report_meta_tags = [
        meta_tag
        for meta_tag in meta_tags
        if meta_tag.get("name") in tags_to_search
        or meta_tag.get("property") in tags_to_search
    ]

    if not silent:
        print("\n\n\n\n---- all <meta> tags ---\n")
        for i, meta_tag in enumerate(meta_tags, start=1):
            print("-- meta tag", i, "--")
            print("name:", meta_tag.get("name"))
            print("property:", meta_tag.get("property"))
            print("content:", meta_tag.get("content"))
            print("---------------- \n")

        print("\n\n\n\n---- all <a> links ---")
        for i, a_tag in enumerate(a_tags, start=1):
            print("\n-- a link", i, "-- ")
            print("target:", a_tag.get("target"))
            print("text:", a_tag.text)
            print("href:", a_tag.get("href"))
            print("-------------- ")

    # Insertion order matters: it is the key order of the "json" dump below.
    final_report = {"scrapped-url": url}
    if h1_tag is not None:
        final_report["h1"] = h1_tag.text

    for report_meta_tag in report_meta_tags:
        # Prefer the "name" attribute as key; fall back to "property".
        key = report_meta_tag.get("name")
        if key is None:
            key = report_meta_tag.get("property")
        if key is not None:
            final_report[key] = report_meta_tag.get("content")

    final_report["first-a-link"] = a_tags[0].get("href") if a_tags else ""
    final_report["total-a-links"] = len(a_tags)

    if h2_tag is not None:
        final_report["h2"] = h2_tag.text

    final_report["total-meta-tags"] = len(meta_tags)

    final_report["json"] = json.dumps(final_report)
    if not silent:
        print("\n\n\n----report---\n")
        for key in final_report:
            print("* ", key, ":", final_report[key])

    return final_report
85
+
86
+
87
def get_tags(url):
    """Return the ``sniff_url`` report for *url* with console output disabled."""
    report = sniff_url(url=url, silent=True)
    return report
@@ -0,0 +1,7 @@
1
+ import ohmyscrapper.models.urls_manager as urls_manager
2
+
3
+
4
def untouch_all():
    """Call ``urls_manager.untouch_all_urls()`` and print a confirmation."""
    urls_manager.untouch_all_urls()
    confirmation = "urls have been untouched"
    print(confirmation)
@@ -0,0 +1,109 @@
1
+ Metadata-Version: 2.3
2
+ Name: ohmyscrapper
3
+ Version: 0.1.1
4
+ Summary: This project aims to create a text-based scraper containing links to create a final PDF with general information about job openings.
5
+ Author: Cesar Cardoso gh@bouli
6
+ Author-email: Cesar Cardoso gh@bouli <hello@cesarcardoso.cc>
7
+ Requires-Dist: beautifulsoup4>=4.14.3
8
+ Requires-Dist: google-genai>=1.55.0
9
+ Requires-Dist: pandas>=2.3.3
10
+ Requires-Dist: python-dotenv>=1.2.1
11
+ Requires-Dist: pyyaml>=6.0.3
12
+ Requires-Dist: requests>=2.32.5
13
+ Requires-Dist: rich>=14.2.0
14
+ Requires-Dist: urlextract>=1.9.0
15
+ Requires-Python: >=3.11
16
+ Description-Content-Type: text/markdown
17
+
18
+ # OhMyScrapper - v0.1.1
19
+
20
+ This project aims to create a text-based scraper containing links to create a
21
+ final PDF with general information about job openings.
22
+
23
+ > This project is using [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer) by default.
24
+
25
+ ## Scope
26
+
27
+ - Read texts;
28
+ - Extract links;
29
+ - Use meta og:tags to extract information;
30
+
31
+ ## Installation
32
+
33
+ I recommend using [uv](https://docs.astral.sh/uv/getting-started/installation/#standalone-installer), so you can just run the command below and everything is installed:
34
+ ```shell
35
+ uv sync
36
+ ```
37
+
38
+ ## How to use and test (development only)
39
+
40
+ OhMyScrapper works in 3 stages:
41
+
42
+ 1. It collects and loads urls from a text (by default `input/_chat.txt`) in a database;
43
+ 2. It scrapes/accesses the collected urls and reads what is relevant. If it finds new urls, they are collected as well;
44
+ 3. Export a list of urls in CSV files;
45
+
46
+ You can run all 3 stages with the command:
47
+ ```shell
48
+ make start
49
+ ```
50
+ > Remember to add your text file in the folder `/input` with the name `_chat.txt`!
51
+
52
+ You will find the exported files in the folder `/output` like this:
53
+ - `/output/report.csv`
54
+ - `/output/report.csv-preview.html`
55
+ - `/output/urls-simplified.csv`
56
+ - `/output/urls-simplified.csv-preview.html`
57
+ - `/output/urls.csv`
58
+ - `/output/urls.csv-preview.html`
59
+
60
+ ### BUT: if you want to do step by step, here it is:
61
+
62
+ First we load a text file you would like to look for urls, the idea here is to
63
+ use the whatsapp history, but it works with any txt file.
64
+
65
+ The default file is `input/_chat.txt`. If you have the default file you just use
66
+ the command `load`:
67
+ ```shell
68
+ make load
69
+ ```
70
+ or, if you have another file, just use the argument `-file` like this:
71
+ ```shell
72
+ uv run main.py load -file=my-text-file.txt
73
+ ```
74
+ That will create a database if it doesn't exist and store every url the oh-my-scrapper
75
+ finds. After that, let's scrape the urls with the command `scrap-urls`:
76
+
77
+ ```shell
78
+ make scrap-urls
79
+ ```
80
+
81
+ That will scrap only the linkedin urls we are interested in. For now they are:
82
+ - linkedin_post: https://%.linkedin.com/posts/%
83
+ - linkedin_redirect: https://lnkd.in/%
84
+ - linkedin_job: https://%.linkedin.com/jobs/view/%
85
+ - linkedin_feed: https://%.linkedin.com/feed/%
86
+ - linkedin_company: https://%.linkedin.com/company/%
87
+
88
+ But we can use every other one generically using the argument `--ignore-type`:
89
+ ```shell
90
+ uv run main.py scrap-urls --ignore-type
91
+ ```
92
+
93
+ And we can ask it to run recursively by adding the argument `--recursive`:
94
+ ```shell
95
+ uv run main.py scrap-urls --recursive
96
+ ```
97
+ > !!! important: we are not sure what blocks (e.g. rate limits) we may run into when making too many requests
98
+
99
+ And we can finally export with the command:
100
+ ```shell
101
+ make export
102
+ ```
103
+
104
+
105
+ That's the basic usage!
106
+ But you can understand more using the help:
107
+ ```shell
108
+ uv run main.py --help
109
+ ```
@@ -0,0 +1,16 @@
1
+ ohmyscrapper/__init__.py,sha256=ob4q1EP5nxpOA1uMW2MdyPQpgCGziGDjkJOAKien_i8,4917
2
+ ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
3
+ ohmyscrapper/models/urls_manager.py,sha256=nEGjd1SInCz7ccCuFT5Qh5GRoBejfVogYVHKgwIDCbI,10504
4
+ ohmyscrapper/modules/classify_urls.py,sha256=LRYCD2Rmc7vz65uI9IjmFLyB5M-5fmyC5o-ZJZhmqOc,711
5
+ ohmyscrapper/modules/load_txt.py,sha256=VycWtOM4BNFSYGqBLr5_7hAFjO3Fzyeb_xpDvWTWxaw,866
6
+ ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
7
+ ohmyscrapper/modules/process_with_ai.py,sha256=oXeEth4vdzaQQoi3au8lDZ3uIJc0mQ_CF92w4CFckNk,5143
8
+ ohmyscrapper/modules/scrap_urls.py,sha256=5Uk3dze6e4dRGlmOsyJ_A6Jat_L6vIVURtfll2hWGdw,5872
9
+ ohmyscrapper/modules/seed.py,sha256=KeTSbmTdNTkVCtzk9iQmeuEqB0kG-rTZJb2a1WdROL4,129
10
+ ohmyscrapper/modules/show.py,sha256=5VkDfFXWBik2YHk7xuneJCnndAroUHDAlWGRRNSSkYc,3194
11
+ ohmyscrapper/modules/sniff_url.py,sha256=hsMd-mEZPPOqiyDCc-QW_ThJ1GIAL7vixiX3-fLO9AU,2716
12
+ ohmyscrapper/modules/untouch_all.py,sha256=E1U9e3sOG7suzc8ZTWcYiQQo9mPmLJ0piXdXUjFLEd4,162
13
+ ohmyscrapper-0.1.1.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
14
+ ohmyscrapper-0.1.1.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
15
+ ohmyscrapper-0.1.1.dist-info/METADATA,sha256=CBiGxmdZdR37mF1xh8CfRa7PQaII804RzcCGHLsGUlM,3381
16
+ ohmyscrapper-0.1.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.9.17
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ ohmyscrapper = ohmyscrapper:main
3
+