ohmyscrapper 0.2.3__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,31 +4,45 @@ import time
 import glob
 import pandas as pd
 from urllib.parse import urlparse, urlunparse
+from ohmyscrapper.core import config
 
 
 def get_db_dir():
-    if not os.path.exists("db"):
-        os.mkdir("db")
-    return "db"
+    db_folder = config.get_dir("db")
+    if not os.path.exists(db_folder):
+        os.mkdir(db_folder)
+    return db_folder
 
 
 def get_db_path():
-    return get_db_dir() + "/local.db"
+    db_file = config.get_db()
+    return os.path.join(get_db_dir(), db_file)
 
 
 def get_db_connection():
+    if not os.path.exists(get_db_path()):
+        create_tables(sqlite3.connect(get_db_path()))
     return sqlite3.connect(get_db_path())
 
 
-# TODO: check if it makes sense
-conn = get_db_connection()
+def use_connection(func):
+    def provide_connection(*args, **kwargs):
+        global conn
+        with get_db_connection() as conn:
+            try:
+                return func(*args, **kwargs)
+            except:
+                update_db()
+                return func(*args, **kwargs)
 
+    return provide_connection
 
-def create_tables():
+
+def create_tables(conn):
 
     c = conn.cursor()
     c.execute(
-        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, h1 TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
+        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, title TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
     )
     c.execute(
         "CREATE TABLE IF NOT EXISTS ai_log (id INTEGER PRIMARY KEY, instructions STRING, response STRING, model STRING, prompt_file STRING, prompt_name STRING, created_at DATETIME)"
@@ -38,27 +52,25 @@ def create_tables():
         "CREATE TABLE IF NOT EXISTS urls_valid_prefix (id INTEGER PRIMARY KEY, url_prefix TEXT UNIQUE, url_type TEXT)"
     )
 
-    return pd.read_sql_query("SELECT * FROM urls LIMIT 100", conn)
 
+def update_db():
+    try:
+        c = conn.cursor()
+        c.execute("ALTER TABLE urls RENAME COLUMN h1 TO title")
+    except:
+        pass
 
-# TODO: not sure this should be something. depends on the project
-def seeds():
-    create_tables()
 
-    add_urls_valid_prefix("https://%.linkedin.com/posts/%", "linkedin_post")
-    add_urls_valid_prefix("https://lnkd.in/%", "linkedin_redirect")
-    add_urls_valid_prefix("https://%.linkedin.com/jobs/view/%", "linkedin_job")
-    add_urls_valid_prefix("https://%.linkedin.com/feed/%", "linkedin_feed")
-    add_urls_valid_prefix("https://%.linkedin.com/company/%", "linkedin_company")
+def seeds(seeds={}):
 
-    # add_urls_valid_prefix("%.pdf", "pdf")
-    # add_url('https://imazon.org.br/categorias/artigos-cientificos/')
+    for url_type, url_prefix in seeds.items():
+        add_urls_valid_prefix(url_prefix, url_type)
 
     return True
 
 
+@use_connection
 def add_urls_valid_prefix(url_prefix, url_type):
-    conn = get_db_connection()
 
     df = pd.read_sql_query(
         f"SELECT * FROM urls_valid_prefix WHERE url_prefix = '{url_prefix}'", conn
@@ -72,6 +84,7 @@ def add_urls_valid_prefix(url_prefix, url_type):
     conn.commit()
 
 
+@use_connection
 def get_urls_valid_prefix_by_type(url_type):
     df = pd.read_sql_query(
         f"SELECT * FROM urls_valid_prefix WHERE url_type = '{url_type}'", conn
@@ -79,12 +92,14 @@ def get_urls_valid_prefix_by_type(url_type):
     return df
 
 
+@use_connection
 def get_urls_valid_prefix_by_id(id):
     df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix WHERE id = '{id}'", conn)
     return df
 
 
 # TODO: pagination required
+@use_connection
 def get_urls_valid_prefix(limit=0):
     if limit > 0:
         df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix LIMIT {limit}", conn)
@@ -94,6 +109,7 @@ def get_urls_valid_prefix(limit=0):
 
 
 # TODO: pagination required
+@use_connection
 def get_urls(limit=0):
     if limit > 0:
         df = pd.read_sql_query(
@@ -104,6 +120,7 @@ def get_urls(limit=0):
     return df
 
 
+@use_connection
 def get_urls_report():
     sql = """
     WITH parent_url AS (
@@ -113,7 +130,7 @@ def get_urls_report():
         SELECT
             u.id,
             u.url,
-            u.h1
+            u.title
         FROM urls u
         INNER JOIN parent_url p
             ON u.url = p.parent_url
@@ -122,9 +139,9 @@ def get_urls_report():
         u.id,
         u.url_type,
         u.url,
-        COALESCE(u.h1, p.h1) as h1,
+        COALESCE(u.title, p.title) as title,
         p.url as parent_url,
-        p.h1 as parent_h1
+        p.title as parent_title
     FROM urls u
     LEFT JOIN parents p
         ON u.parent_url = p.url
@@ -138,6 +155,7 @@ def get_urls_report():
     return df
 
 
+@use_connection
 def get_url_by_url(url):
     url = clean_url(url)
     df = pd.read_sql_query(f"SELECT * FROM urls WHERE url = '{url}'", conn)
@@ -145,12 +163,14 @@ def get_url_by_url(url):
     return df
 
 
+@use_connection
 def get_url_by_id(id):
     df = pd.read_sql_query(f"SELECT * FROM urls WHERE id = '{id}'", conn)
 
     return df
 
 
+@use_connection
 def get_urls_by_url_type(url_type):
     df = pd.read_sql_query(
         f"SELECT * FROM urls WHERE history = 0 AND url_type = '{url_type}'", conn
@@ -158,6 +178,7 @@ def get_urls_by_url_type(url_type):
     return df
 
 
+@use_connection
 def get_urls_by_url_type_for_ai_process(url_type="linkedin_post", limit=10):
     df = pd.read_sql_query(
         f"SELECT * FROM urls WHERE history = 0 AND url_type = '{url_type}' AND ai_processed = 0 LIMIT {limit}",
@@ -166,6 +187,7 @@ def get_urls_by_url_type_for_ai_process(url_type="linkedin_post", limit=10):
     return df
 
 
+@use_connection
 def get_url_like_unclassified(like_condition):
     df = pd.read_sql_query(
         f"SELECT * FROM urls WHERE history = 0 AND url LIKE '{like_condition}' AND url_type IS NULL",
@@ -174,12 +196,13 @@ def get_url_like_unclassified(like_condition):
     return df
 
 
-def add_url(url, h1=None, parent_url=None):
+@use_connection
+def add_url(url, title=None, parent_url=None):
     url = clean_url(url)
     c = conn.cursor()
 
-    if h1 is not None:
-        h1 = h1.strip()
+    if title is not None:
+        title = title.strip()
 
     if parent_url is None:
         parent_url = None
@@ -188,14 +211,15 @@ def add_url(url, h1=None, parent_url=None):
 
     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url, h1, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
-            (url, h1, parent_url, int(time.time())),
+            "INSERT INTO urls (url, title, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
+            (url, title, parent_url, int(time.time())),
         )
         conn.commit()
 
     return get_url_by_url(url)
 
 
+@use_connection
 def add_ai_log(instructions, response, model, prompt_file, prompt_name):
     c = conn.cursor()
 
@@ -205,10 +229,14 @@ def add_ai_log(instructions, response, model, prompt_file, prompt_name):
     )
     conn.commit()
 
+
+@use_connection
 def get_ai_log():
     df = pd.read_sql_query(f"SELECT * FROM ai_log", conn)
     return df
 
+
+@use_connection
 def set_url_destiny(url, destiny):
     url = clean_url(url)
     destiny = clean_url(destiny)
@@ -222,45 +250,62 @@ def set_url_destiny(url, destiny):
     conn.commit()
 
 
-def set_url_h1(url, value):
+@use_connection
+def set_url_title(url, value):
     value = str(value).strip()
     url = clean_url(url)
     c = conn.cursor()
-    c.execute("UPDATE urls SET h1 = ? WHERE url = ?", (value, url))
+    c.execute("UPDATE urls SET title = ? WHERE url = ?", (value, url))
     conn.commit()
 
 
-def set_url_h1_by_id(id, value):
+@use_connection
+def set_url_title_by_id(id, value):
     value = str(value).strip()
 
     c = conn.cursor()
-    c.execute("UPDATE urls SET h1 = ? WHERE id = ?", (value, id))
+    c.execute("UPDATE urls SET title = ? WHERE id = ?", (value, id))
     conn.commit()
 
 
+@use_connection
 def set_url_ai_processed_by_id(id, json_str):
     value = 1
     value = str(value).strip()
     c = conn.cursor()
-    c.execute("UPDATE urls SET ai_processed = ? , json_ai = ? WHERE id = ?", (value, json_str, id))
+    c.execute(
+        "UPDATE urls SET ai_processed = ? , json_ai = ? WHERE id = ?",
+        (value, json_str, id),
+    )
     conn.commit()
 
+
+@use_connection
 def set_url_empty_ai_processed_by_id(id, json_str="empty result"):
     value = 1
     value = str(value).strip()
     c = conn.cursor()
-    c.execute("UPDATE urls SET ai_processed = ? , json_ai = ? WHERE ai_processed = 0 AND id = ?", (value, json_str, id))
+    c.execute(
+        "UPDATE urls SET ai_processed = ? , json_ai = ? WHERE ai_processed = 0 AND id = ?",
+        (value, json_str, id),
+    )
     conn.commit()
 
+
+@use_connection
 def set_url_ai_processed_by_url(url, json_str):
     value = 1
     value = str(value).strip()
     url = clean_url(url)
     c = conn.cursor()
-    c.execute("UPDATE urls SET ai_processed = ?, json_ai = ? WHERE url = ?", (value, json_str, url))
+    c.execute(
+        "UPDATE urls SET ai_processed = ?, json_ai = ? WHERE url = ?",
+        (value, json_str, url),
+    )
     conn.commit()
 
 
+@use_connection
 def set_url_description(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -268,6 +313,7 @@ def set_url_description(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_description_links(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -275,6 +321,7 @@ def set_url_description_links(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_json(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -282,6 +329,7 @@ def set_url_json(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_error(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -289,6 +337,7 @@ def set_url_error(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_type_by_id(url_id, url_type):
     c = conn.cursor()
     c.execute(f"UPDATE urls SET url_type = '{url_type}' WHERE id = {url_id}")
@@ -312,6 +361,7 @@ def clean_url(url):
     return url
 
 
+@use_connection
 def get_untouched_urls(
     limit=10, randomize=True, ignore_valid_prefix=False, only_parents=True
 ):
@@ -331,6 +381,7 @@ def get_untouched_urls(
     return df
 
 
+@use_connection
 def touch_url(url):
     url = clean_url(url)
     c = conn.cursor()
@@ -338,6 +389,7 @@ def touch_url(url):
     conn.commit()
 
 
+@use_connection
 def untouch_url(url):
     url = clean_url(url)
     c = conn.cursor()
@@ -345,12 +397,14 @@ def untouch_url(url):
     conn.commit()
 
 
+@use_connection
 def untouch_all_urls():
     c = conn.cursor()
     c.execute("UPDATE urls SET last_touch = NULL WHERE history = 0")
     conn.commit()
 
 
+@use_connection
 def set_all_urls_as_history():
     c = conn.cursor()
     c.execute("UPDATE urls SET history = 1")
@@ -382,19 +436,19 @@ def merge_dbs() -> None:
                 row["description"],
                 row["json"],
             )
-        # ßmerge_url(df)
 
 
-def merge_url(url, h1, last_touch, created_at, description, json):
+@use_connection
+def merge_url(url, title, last_touch, created_at, description, json):
     url = clean_url(url)
     c = conn.cursor()
 
-    if h1 is not None:
-        h1 = h1.strip()
+    if title is not None:
+        title = title.strip()
 
     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url, h1, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
-            (url, h1, last_touch, created_at, description, json),
+            "INSERT INTO urls (url, title, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
+            (url, title, last_touch, created_at, description, json),
         )
         conn.commit()
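
The biggest structural change in this file is the new `use_connection` decorator: instead of a single module-level `conn = get_db_connection()` created at import time, every decorated helper now opens its own connection inside a `with` block (and retries once after `update_db()` if the first attempt raises). A minimal, self-contained sketch of that pattern, using an in-memory database and stand-in names rather than the package's actual module:

import sqlite3

DB_PATH = ":memory:"  # stand-in for the package's get_db_path()


def get_db_connection():
    return sqlite3.connect(DB_PATH)


def use_connection(func):
    # Provide a connection to the wrapped function through the module-global
    # `conn`, mirroring the decorator introduced in 0.7.0 (error retry omitted).
    def provide_connection(*args, **kwargs):
        global conn
        with get_db_connection() as conn:
            return func(*args, **kwargs)

    return provide_connection


@use_connection
def count_urls():
    # Decorated helpers can use `conn` without opening or closing it themselves.
    c = conn.cursor()
    c.execute("CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url TEXT)")
    c.execute("SELECT COUNT(*) FROM urls")
    return c.fetchone()[0]


print(count_urls())  # 0 on a fresh database

Note that sqlite3's connection context manager commits or rolls back the enclosing transaction but does not close the connection, so each decorated call effectively gets its own connection and transaction scope.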
@@ -1,23 +1,31 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.modules import seed
 import pandas as pd
 import time
 
 
 def classify_urls(recursive=False):
-    urls_manager.seeds()
     df = urls_manager.get_urls_valid_prefix()
+    if len(df) == 0:
+        seed.seed()
+        classify_urls(recursive=recursive)
+        return
 
     keep_alive = True
     while keep_alive:
-        print("waking up!")
+        print("#️⃣ URL Classifier woke up to classify urls!")
        for index, row_prefix in df.iterrows():
-            df_urls = urls_manager.get_url_like_unclassified(like_condition=row_prefix["url_prefix"])
+            df_urls = urls_manager.get_url_like_unclassified(
+                like_condition=row_prefix["url_prefix"]
+            )
            for index, row_urls in df_urls.iterrows():
-                urls_manager.set_url_type_by_id(url_id =row_urls["id"], url_type=row_prefix["url_type"])
+                urls_manager.set_url_type_by_id(
+                    url_id=row_urls["id"], url_type=row_prefix["url_type"]
+                )
 
        if not recursive:
-            print("ending...")
+            print("#️⃣ URL Classifier said: I'm done! See you soon...")
            keep_alive = False
        else:
-            print("sleeping...")
+            print("#️⃣ URL Classifier is taking a nap...")
            time.sleep(10)
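
For context on what the classifier loop above is doing: each row of `urls_valid_prefix` pairs a SQL `LIKE` pattern with a `url_type`, `get_url_like_unclassified` fetches the still-unclassified URLs matching that pattern, and `set_url_type_by_id` stamps them. A rough, self-contained illustration of that matching step against an in-memory table, using one of the prefix patterns from the old `seeds()` defaults (standalone code, not the package's own helpers):

import sqlite3

conn = sqlite3.connect(":memory:")
c = conn.cursor()
c.execute("CREATE TABLE urls (id INTEGER PRIMARY KEY, url TEXT, url_type TEXT)")
c.executemany(
    "INSERT INTO urls (url) VALUES (?)",
    [
        ("https://www.linkedin.com/posts/example-activity-123",),
        ("https://example.org/blog/post",),
    ],
)

# One prefix/type pair, as stored in urls_valid_prefix.
url_prefix, url_type = "https://%.linkedin.com/posts/%", "linkedin_post"

# Find unclassified URLs that match the LIKE pattern...
c.execute(
    "SELECT id, url FROM urls WHERE url_type IS NULL AND url LIKE ?", (url_prefix,)
)
matches = c.fetchall()

# ...and stamp each one with the prefix's url_type.
for row_id, _ in matches:
    c.execute("UPDATE urls SET url_type = ? WHERE id = ?", (url_type, row_id))
conn.commit()

print(matches)  # only the linkedin.com/posts/ URL is classified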
@@ -1,31 +1,99 @@
 import os
 from urlextract import URLExtract
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config
 
 
-def load_txt(file_name="input/_chat.txt"):
+def _increment_file_name(text_file_content, file_name):
+    print(f"reading and loading file `{file_name}`... ")
+    with open(file_name, "r") as f:
+        return text_file_content + f.read()
 
-    if not os.path.exists("input"):
-        os.mkdir("input")
 
-    urls_manager.create_tables()
+def load_txt(file_name="input", verbose=False):
+    input_folder = config.get_dir("input")
+    if not os.path.exists(input_folder):
+        os.mkdir(input_folder)
+
     urls_manager.seeds()
-    # make it recursive for all files
-    text_file_content = open(file_name, "r").read()
 
-    put_urls_from_string(text_to_process=text_file_content)
+    text_file_content = ""
+    if file_name is not None and not os.path.isdir(file_name):
+        print(f"📖 reading file `{file_name}`... ")
+        if not os.path.exists(file_name):
+            if file_name.startswith("https://") or file_name.startswith("http://"):
+                text_file_content = " " + file_name + " "
+            else:
+                print(f"\n file `{file_name}` not found.")
+                return
+        else:
+            text_file_content = _increment_file_name(
+                text_file_content=text_file_content, file_name=file_name
+            )
+    else:
+        input_folder = config.get_dir("input")
+        print(f"📂 reading {input_folder} directory... ")
+        if file_name is None:
+            dir_files = input_folder
+        else:
+            dir_files = file_name
+        text_files = os.listdir(dir_files)
+        for file in text_files:
+            if not file.endswith(".txt"):
+                text_files.remove(file)
+        if len(text_files) == 0:
+            print(f"No text files found in {input_folder} directory!")
+            return
+        elif len(text_files) == 1:
+            print(f"📖 reading file `{dir_files}/{text_files[0]}`... ")
+            text_file_content = _increment_file_name(
+                text_file_content=text_file_content,
+                file_name=os.path.join(dir_files, text_files[0]),
+            )
+        else:
+            print("\nChoose a text file. Use `*` for process all and `q` to quit:")
+            for index, file in enumerate(text_files):
+                print(f"[{index}]:", os.path.join(dir_files, file))
+
+            text_file_option = -1
+            while text_file_option < 0 or text_file_option >= len(text_files):
+                text_file_option = input("Enter the file number: ")
+                if text_file_option == "*":
+                    for file in text_files:
+                        text_file_content = _increment_file_name(
+                            text_file_content=text_file_content,
+                            file_name=os.path.join(dir_files, file),
+                        )
+                    text_file_option = 0
+                elif text_file_option == "q":
+                    return
+                elif text_file_option.isdigit():
+                    text_file_option = int(text_file_option)
+                    if text_file_option >= 0 and text_file_option < len(text_files):
+                        text_file_content = _increment_file_name(
+                            text_file_content=text_file_content,
+                            file_name=os.path.join(
+                                dir_files, text_files[int(text_file_option)]
+                            ),
+                        )
+
+    print("🔎 looking for urls...")
+    urls_found = put_urls_from_string(
+        text_to_process=text_file_content, verbose=verbose
+    )
 
-    # move_it_to_processed
     print("--------------------")
-    print(file_name, "processed")
+    print("files processed")
+    print(f"📦 {urls_found} urls were extracted and packed into the database")
 
 
-def put_urls_from_string(text_to_process, parent_url=None):
+def put_urls_from_string(text_to_process, parent_url=None, verbose=False):
     if isinstance(text_to_process, str):
        extractor = URLExtract()
        for url in extractor.find_urls(text_to_process):
            urls_manager.add_url(url=url, parent_url=parent_url)
-            print(url, "added")
+            if verbose:
+                print(url, "added")
 
        return len(extractor.find_urls(text_to_process))
     else:
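
The rewritten loader ultimately funnels everything through `put_urls_from_string`, which uses the urlextract package to pull URLs out of free text before handing each one to `urls_manager.add_url`. A minimal standalone example of just that extraction step, with made-up sample text and URLs (assumes urlextract is installed; the database calls are omitted):

from urlextract import URLExtract

text = (
    "Notes from the chat: see https://lnkd.in/abc123 "
    "and https://www.linkedin.com/posts/some-post for details"
)

extractor = URLExtract()
urls = extractor.find_urls(text)

print(urls)       # e.g. ['https://lnkd.in/abc123', 'https://www.linkedin.com/posts/some-post']
print(len(urls))  # the count that load_txt reports as extracted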