ohmyscrapper-0.6.1-py3-none-any.whl → ohmyscrapper-0.7.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ohmyscrapper/__init__.py CHANGED
@@ -20,7 +20,7 @@ from ohmyscrapper.core.config import update
 
 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.6.1")
+    parser.add_argument("--version", action="version", version="%(prog)s v0.7.4")
 
     update()
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
@@ -28,6 +28,9 @@ def main():
         "start",
         help="Make the entire process of 📦 loading, 🐶 scraping and 📜🖋️ exporting with the default configuration.",
     )
+    start_parser.add_argument(
+        "-input", default=None, help="File/Folder path or url for pre-loading."
+    )
 
     start_parser.add_argument(
         "--ai",
@@ -50,6 +53,14 @@ def main():
         help="Add all `url_types` from the bank to the `/ohmyscrapper/url_types.yaml` file.",
         action="store_true",
     )
+
+    seed_parser.add_argument(
+        "--reset",
+        default=False,
+        help="Reset all `url_types`.",
+        action="store_true",
+    )
+
     untouch_parser = subparsers.add_parser(
         "untouch-all", help="Untouch all urls. That resets classification"
     )
@@ -85,6 +96,9 @@ def main():
     scrap_urls_parser.add_argument(
         "--verbose", default=False, help="Run in verbose mode", action="store_true"
    )
+    scrap_urls_parser.add_argument(
+        "-input", default=None, help="File/Folder path or url for pre-loading."
+    )
 
     sniff_url_parser = subparsers.add_parser("sniff-url", help="🐕 Sniff/Check url")
     sniff_url_parser.add_argument(
@@ -131,7 +145,7 @@ def main():
         if args.export:
             export_url_types_to_file()
         else:
-            seed()
+            seed(args.reset)
         return
 
     if args.command == "untouch-all":
@@ -143,6 +157,9 @@ def main():
         return
 
     if args.command == "scrap-urls":
+        if args.input != None:
+            load_txt(file_name=args.input, verbose=args.verbose)
+
        scrap_urls(
             recursive=args.recursive,
             ignore_valid_prefix=args.ignore_type,
@@ -182,7 +199,12 @@ def main():
         return
 
     if args.command == "start":
-        load_txt()
+        seed()
+        if args.input != None:
+            load_txt(file_name=args.input)
+        else:
+            load_txt()
+
         scrap_urls(
             recursive=True,
             ignore_valid_prefix=True,
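
Taken together, the `__init__.py` changes add two user-facing options: `-input` on `start` and `scrap-urls` (pre-load a file, folder path, or URL before scraping) and `--reset` on `seed` (clear the stored `url_types` before re-seeding). A minimal sketch of that wiring is below; `stub_seed` and `stub_load_txt` are illustrative stand-ins, not the package's real functions.

```python
# Minimal sketch of the new 0.7.x flags. stub_seed/stub_load_txt are
# illustrative stand-ins for the package's real seed() and load_txt().
import argparse


def stub_seed(reset=False):
    print(f"seed(reset={reset})")


def stub_load_txt(file_name=None):
    print(f"load_txt(file_name={file_name!r})")


def main(argv=None):
    parser = argparse.ArgumentParser(prog="ohmyscrapper")
    subparsers = parser.add_subparsers(dest="command")

    start_parser = subparsers.add_parser("start")
    start_parser.add_argument("-input", default=None)  # file/folder path or url

    seed_parser = subparsers.add_parser("seed")
    seed_parser.add_argument("--reset", default=False, action="store_true")

    args = parser.parse_args(argv)
    if args.command == "seed":
        stub_seed(reset=args.reset)
    elif args.command == "start":
        stub_seed()
        if args.input is not None:
            stub_load_txt(file_name=args.input)
        else:
            stub_load_txt()


main(["seed", "--reset"])               # -> seed(reset=True)
main(["start", "-input", "links.txt"])  # -> seed(reset=False), load_txt(file_name='links.txt')
```

Note that argparse accepts the single-dash `-input` spelling used in the diff and exposes it as `args.input`, which is how the dispatch code above reads it.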

ohmyscrapper/core/config.py CHANGED
@@ -39,6 +39,12 @@ def get_ai(param):
     )
 
 
+def get_sniffing(param):
+    return config_files.get_param(
+        parent_param="sniffing", param=param, default_app_dir=default_app_dir
+    )
+
+
 def load_config(force_default=False):
     config_file_name = "config.yaml"
     config_params = config_files.create_and_read_config_file(
@@ -67,6 +73,18 @@ def get_url_types():
     )
 
 
+def get_url_sniffing():
+    file = get_files("url_sniffing")
+    return config_files.create_and_read_config_file(
+        file, default_app_dir=default_app_dir
+    )
+
+
+def append_url_sniffing(data):
+    file = get_files("url_sniffing")
+    _append_config_file(data, file)
+
+
 def append_url_types(url_types):
     url_types_file = get_files("url_types")
     _append_config_file(url_types, url_types_file)

ohmyscrapper/core/config_files.py CHANGED
@@ -4,14 +4,29 @@ import yaml
 
 def create_and_read_config_file(file_name, default_app_dir, force_default=False):
     config_file = config_file_path(file_name, default_app_dir)
+    default_config_params = _get_default_file(default_file=file_name)
     if force_default or not os.path.exists(config_file):
-        config_params = _get_default_file(default_file=file_name)
         overwrite_config_file(
-            data=config_params, file_name=file_name, default_app_dir=default_app_dir
+            data=default_config_params,
+            file_name=file_name,
+            default_app_dir=default_app_dir,
         )
+        config_params = default_config_params
     else:
         with open(config_file, "r") as f:
             config_params = yaml.safe_load(f.read())
+        if complete_config_file(
+            config_params=config_params,
+            default_config_params=default_config_params,
+            file_name=file_name,
+            default_app_dir=default_app_dir,
+        ):
+            config_params = create_and_read_config_file(
+                file_name=file_name,
+                default_app_dir=default_app_dir,
+                force_default=force_default,
+            )
+
     if config_params is None:
         config_params = create_and_read_config_file(
             file_name=file_name, default_app_dir=default_app_dir, force_default=True
@@ -19,6 +34,18 @@ def create_and_read_config_file(file_name, default_app_dir, force_default=False)
     return config_params
 
 
+def complete_config_file(
+    config_params, default_config_params, file_name, default_app_dir
+):
+    has_updated = False
+    for key, values in default_config_params.items():
+        if key not in config_params.keys():
+            has_updated = True
+            data = {key: values}
+            append_config_file(data, file_name, default_app_dir)
+    return has_updated
+
+
 def overwrite_config_file(data, file_name, default_app_dir):
     config_file = config_file_path(file_name, default_app_dir)
     with open(config_file, "+w") as f:
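
The new `complete_config_file()` is what lets 0.7.x add settings (such as the new `sniffing` block) to a config file written by an older version: any top-level key present in the bundled defaults but missing from the user's file is appended, and the file is re-read. Below is a self-contained sketch of that merge step, assuming PyYAML; it merges in memory with inline YAML strings instead of appending to the real files, so names and paths are illustrative only.

```python
# Sketch of the "complete the user's config with new default keys" behaviour.
# Uses plain dicts and inline YAML; the real helper writes back to the file.
import yaml


def complete_config(config_params: dict, default_config_params: dict) -> bool:
    """Add any top-level default key the user's config is missing.
    Returns True when something was added, mirroring complete_config_file()."""
    has_updated = False
    for key, values in default_config_params.items():
        if key not in config_params:
            config_params[key] = values
            has_updated = True
    return has_updated


defaults = yaml.safe_load(
    "default_files:\n  url_types: url_types.yaml\nsniffing:\n  timeout: 10\n"
)
user_cfg = yaml.safe_load("default_files:\n  url_types: url_types.yaml\n")

print(complete_config(user_cfg, defaults))  # True: 'sniffing' was missing
print(user_cfg["sniffing"]["timeout"])      # 10
```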

ohmyscrapper/core/default_files/config.yaml CHANGED
@@ -10,6 +10,10 @@ default_dirs:
 
 default_files:
   url_types: url_types.yaml
+  url_sniffing: url_sniffing.yaml
 
 ai:
   default_prompt_file: prompt.md
+
+sniffing:
+  timeout: 10

ohmyscrapper/core/default_files/url_sniffing.yaml ADDED
@@ -0,0 +1,29 @@
+linkedin_feed:
+  metatags:
+    og:url: url_destiny
+
+linkedin_job:
+  bodytags:
+    h1: title
+  metatags:
+    og:title: title
+    og:description: description
+    description: description
+
+linkedin_post:
+  bodytags:
+    h1: title
+  metatags:
+    og:title: title
+    og:description: description
+    description: description
+
+linkedin_redirect:
+  metatags:
+    og:url: url_destiny
+  atags:
+    first-tag-as-url_destiny: 5
+
+read_all_a_tags:
+  atags:
+    load_atags: True
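
Each top-level key in `url_sniffing.yaml` is a `url_type`; under it, `metatags` and `bodytags` map a tag name to the database column it should fill (`title`, `description`, or `url_destiny`), and `atags` carries link-handling hints. A small illustrative sketch of reading such an entry with PyYAML; the snippet is inlined here rather than loaded from the package's real file.

```python
# Illustrative only: read one url_type entry and list which scraped tag feeds
# which column of the urls table.
import yaml

snippet = """
linkedin_job:
  bodytags:
    h1: title
  metatags:
    og:title: title
    og:description: description
    description: description
"""

sniffing = yaml.safe_load(snippet)
for url_type, sections in sniffing.items():
    for section, mapping in sections.items():
        for tag, db_field in mapping.items():
            print(f"{url_type}: {section} {tag} -> urls.{db_field}")
```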

ohmyscrapper/models/urls_manager.py CHANGED
@@ -29,7 +29,11 @@ def use_connection(func):
     def provide_connection(*args, **kwargs):
         global conn
         with get_db_connection() as conn:
-            return func(*args, **kwargs)
+            try:
+                return func(*args, **kwargs)
+            except:
+                update_db()
+                return func(*args, **kwargs)
 
     return provide_connection
 
@@ -38,7 +42,7 @@ def create_tables(conn):
 
     c = conn.cursor()
     c.execute(
-        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, h1 TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
+        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, title TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
     )
     c.execute(
         "CREATE TABLE IF NOT EXISTS ai_log (id INTEGER PRIMARY KEY, instructions STRING, response STRING, model STRING, prompt_file STRING, prompt_name STRING, created_at DATETIME)"
@@ -49,6 +53,14 @@ def create_tables(conn):
     )
 
 
+def update_db():
+    try:
+        c = conn.cursor()
+        c.execute("ALTER TABLE urls RENAME COLUMN h1 TO title")
+    except:
+        pass
+
+
 def seeds(seeds={}):
 
     for url_type, url_prefix in seeds.items():
@@ -57,6 +69,14 @@ def seeds(seeds={}):
     return True
 
 
+@use_connection
+def reset_seeds():
+    sql = "DELETE FROM urls_valid_prefix"
+    c = conn.cursor()
+    c.execute(sql)
+    conn.commit()
+
+
 @use_connection
 def add_urls_valid_prefix(url_prefix, url_type):
 
@@ -118,7 +138,7 @@ def get_urls_report():
     SELECT
         u.id,
         u.url,
-        u.h1
+        u.title
     FROM urls u
     INNER JOIN parent_url p
     ON u.url = p.parent_url
@@ -127,9 +147,9 @@ def get_urls_report():
         u.id,
         u.url_type,
         u.url,
-        COALESCE(u.h1, p.h1) as h1,
+        COALESCE(u.title, p.title) as title,
         p.url as parent_url,
-        p.h1 as parent_h1
+        p.title as parent_title
     FROM urls u
     LEFT JOIN parents p
     ON u.parent_url = p.url
@@ -185,12 +205,14 @@ def get_url_like_unclassified(like_condition):
 
 
 @use_connection
-def add_url(url, h1=None, parent_url=None):
+def add_url(url, title=None, parent_url=None):
+    if url[:1] == "/":
+        return
     url = clean_url(url)
     c = conn.cursor()
 
-    if h1 is not None:
-        h1 = h1.strip()
+    if title is not None:
+        title = title.strip()
 
     if parent_url is None:
         parent_url = None
@@ -199,8 +221,8 @@ def add_url(url, h1=None, parent_url=None):
 
     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url, h1, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
-            (url, h1, parent_url, int(time.time())),
+            "INSERT INTO urls (url, title, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
+            (url, title, parent_url, int(time.time())),
         )
         conn.commit()
 
@@ -239,20 +261,20 @@ def set_url_destiny(url, destiny):
 
 
 @use_connection
-def set_url_h1(url, value):
+def set_url_title(url, value):
     value = str(value).strip()
     url = clean_url(url)
     c = conn.cursor()
-    c.execute("UPDATE urls SET h1 = ? WHERE url = ?", (value, url))
+    c.execute("UPDATE urls SET title = ? WHERE url = ?", (value, url))
     conn.commit()
 
 
 @use_connection
-def set_url_h1_by_id(id, value):
+def set_url_title_by_id(id, value):
     value = str(value).strip()
 
     c = conn.cursor()
-    c.execute("UPDATE urls SET h1 = ? WHERE id = ?", (value, id))
+    c.execute("UPDATE urls SET title = ? WHERE id = ?", (value, id))
     conn.commit()
 
 
@@ -328,7 +350,9 @@ def set_url_error(url, value):
 @use_connection
 def set_url_type_by_id(url_id, url_type):
     c = conn.cursor()
-    c.execute(f"UPDATE urls SET url_type = '{url_type}' WHERE id = {url_id}")
+    c.execute(
+        f"UPDATE urls SET url_type = '{url_type}', last_touch = NULL WHERE id = {url_id}"
+    )
     conn.commit()
 
 
@@ -380,8 +404,10 @@ def touch_url(url):
 @use_connection
 def untouch_url(url):
     url = clean_url(url)
+    url = str(url.strip())
+
     c = conn.cursor()
-    c.execute("UPDATE urls SET last_touch = NULL WHERE url = ?", (url))
+    c.execute(f"UPDATE urls SET last_touch = NULL, url_type = NULL WHERE url = '{url}'")
     conn.commit()
 
 
@@ -427,16 +453,16 @@ def merge_dbs() -> None:
 
 
 @use_connection
-def merge_url(url, h1, last_touch, created_at, description, json):
+def merge_url(url, title, last_touch, created_at, description, json):
     url = clean_url(url)
     c = conn.cursor()
 
-    if h1 is not None:
-        h1 = h1.strip()
+    if title is not None:
+        title = title.strip()
 
     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url, h1, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
-            (url, h1, last_touch, created_at, description, json),
+            "INSERT INTO urls (url, title, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
+            (url, title, last_touch, created_at, description, json),
         )
         conn.commit()
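
The `use_connection` decorator above now retries a failed call once after running `update_db()`, which renames the old `h1` column to `title`; in effect this is a lazy, on-demand schema migration for databases created by 0.6.x. Below is a standalone sketch of the same pattern against an in-memory SQLite database; names are illustrative, and it catches `sqlite3.OperationalError` rather than the bare `except` used in the module.

```python
# Sketch of the lazy-migration pattern: run the query, and if it fails because
# the column is still named h1 in an old database, apply the rename and retry.
import sqlite3


def migrate(conn):
    try:
        conn.execute("ALTER TABLE urls RENAME COLUMN h1 TO title")
    except sqlite3.OperationalError:
        pass  # already migrated (or table missing)


def with_retry(conn, func):
    try:
        return func(conn)
    except sqlite3.OperationalError:
        migrate(conn)        # bring the schema up to date, then retry once
        return func(conn)


conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE urls (id INTEGER PRIMARY KEY, h1 TEXT)")  # old schema
conn.execute("INSERT INTO urls (h1) VALUES ('hello')")
print(with_retry(conn, lambda c: c.execute("SELECT title FROM urls").fetchall()))
# -> [('hello',)]
```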

ohmyscrapper/modules/classify_urls.py CHANGED
@@ -1,11 +1,15 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.modules import seed
 import pandas as pd
 import time
 
 
 def classify_urls(recursive=False):
-    urls_manager.seeds()
     df = urls_manager.get_urls_valid_prefix()
+    if len(df) == 0:
+        seed.seed()
+        classify_urls(recursive=recursive)
+        return
 
     keep_alive = True
     while keep_alive:

ohmyscrapper/modules/load_txt.py CHANGED
@@ -19,14 +19,16 @@ def load_txt(file_name="input", verbose=False):
 
     text_file_content = ""
     if file_name is not None and not os.path.isdir(file_name):
-        print(f"📖 reading file `{file_name}`... ")
         if not os.path.exists(file_name):
             if file_name.startswith("https://") or file_name.startswith("http://"):
+                print(f"📖 reading url `{file_name}`... ")
                 text_file_content = " " + file_name + " "
+                urls_manager.untouch_url(url=file_name)
             else:
                 print(f"\n file `{file_name}` not found.")
                 return
         else:
+            print(f"📖 reading file `{file_name}`... ")
             text_file_content = _increment_file_name(
                 text_file_content=text_file_content, file_name=file_name
             )
@@ -51,13 +53,15 @@ def load_txt(file_name="input", verbose=False):
                 file_name=os.path.join(dir_files, text_files[0]),
             )
         else:
-            print("\nChoose a text file. Use `*` for process all and `q` to quit:")
+            print("\nFiles list:")
             for index, file in enumerate(text_files):
                 print(f"[{index}]:", os.path.join(dir_files, file))
 
             text_file_option = -1
             while text_file_option < 0 or text_file_option >= len(text_files):
-                text_file_option = input("Enter the file number: ")
+                text_file_option = input(
+                    "Choose a text file. Use `*` for process all and `q` to quit. Enter the file number: "
+                )
                 if text_file_option == "*":
                     for file in text_files:
                         text_file_content = _increment_file_name(

ohmyscrapper/modules/process_with_ai.py CHANGED
@@ -28,13 +28,13 @@ def process_ai_response(response):
         url_parent = urls_manager.get_url_by_id(url_child_xml["id"])
         if len(url_parent) > 0:
             url_parent = url_parent.iloc[0]
-            h1 = url_child_xml.copy()
-            del h1["id"]
-            del h1["url"]
-            h1 = " - ".join(h1.values())
+            title = url_child_xml.copy()
+            del title["id"]
+            del title["url"]
+            title = " - ".join(title.values())
             if url_parent["description_links"] > 1 and url_child_xml["id"] != "":
-                print("-- child updated -- \n", url_child_xml["url"], ":", h1)
-                urls_manager.set_url_h1(url_child_xml["url"], h1)
+                print("-- child updated -- \n", url_child_xml["url"], ":", title)
+                urls_manager.set_url_title(url_child_xml["url"], title)
                 urls_manager.set_url_ai_processed_by_url(
                     url_child_xml["url"], str(json.dumps(url_child_xml))
                 )
@@ -43,8 +43,8 @@ def process_ai_response(response):
                     url_parent["url"], "children-update"
                 )
             else:
-                print("-- parent updated -- \n", url_parent["url"], ":", h1)
-                urls_manager.set_url_h1(url_parent["url"], h1)
+                print("-- parent updated -- \n", url_parent["url"], ":", title)
+                urls_manager.set_url_title(url_parent["url"], title)
                 urls_manager.set_url_ai_processed_by_url(
                     url_parent["url"], str(json.dumps(url_child_xml))
                 )

ohmyscrapper/modules/scrap_urls.py CHANGED
@@ -2,154 +2,138 @@ import ohmyscrapper.models.urls_manager as urls_manager
 import ohmyscrapper.modules.sniff_url as sniff_url
 import ohmyscrapper.modules.load_txt as load_txt
 import ohmyscrapper.modules.classify_urls as classify_urls
+from ohmyscrapper.core import config
 
 import time
 import random
 
 
-def process_linkedin_redirect(url_report, url, verbose=False):
-    if verbose:
-        print("linkedin_redirect")
-
-    if url_report["total-a-links"] < 5:
-        if "first-a-link" in url_report.keys():
-            url_destiny = url_report["first-a-link"]
-        else:
-            urls_manager.set_url_error(url=url["url"], value="error: no first-a-link")
-            if verbose:
-                print("no url for:", url["url"])
-            return
-    else:
-        if "og:url" in url_report.keys():
-            url_destiny = url_report["og:url"]
-        else:
-            urls_manager.set_url_error(url=url["url"], value="error: no og:url")
-            if verbose:
-                print("no url for:", url["url"])
-            return
-    if verbose:
-        print(url["url"], ">>", url_destiny)
-    urls_manager.add_url(url=url_destiny)
-    urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)
-
+def scrap_url(url, verbose=False):
+    if url["url_type"] is None:
+        url["url_type"] = "generic"
 
-def process_linkedin_feed(url_report, url, verbose=False):
     if verbose:
-        print("linkedin_feed")
+        print("\n\n", url["url_type"] + ":", url["url"])
 
-    if "og:url" in url_report.keys():
-        url_destiny = url_report["og:url"]
-    else:
-        urls_manager.set_url_error(url=url["url"], value="error: no og:url")
+    try:
+        url_type = url["url_type"]
+        sniffing_config = config.get_url_sniffing()
+
+        if url_type not in sniffing_config:
+            default_type_sniffing = {
+                "bodytags": {"h1": "title"},
+                "metatags": {
+                    "og:title": "title",
+                    "og:description": "description",
+                    "description": "description",
+                },
+            }
+            config.append_url_sniffing({url_type: default_type_sniffing})
+            sniffing_config = config.get_url_sniffing()
+
+        url_report = sniff_url.get_tags(
+            url=url["url"], sniffing_config=sniffing_config[url_type]
+        )
+    except Exception as e:
+        urls_manager.set_url_error(url=url["url"], value="error on scrapping")
+        urls_manager.touch_url(url=url["url"])
         if verbose:
-            print("no url for:", url["url"])
+            print("\n\n!!! ERROR FOR:", url["url"])
+            print(
+                "\n\n!!! you can check the URL using the command sniff-url",
+                url["url"],
+                "\n\n",
+            )
         return
 
-    if verbose:
-        print(url["url"], ">>", url_destiny)
-    urls_manager.add_url(url=url_destiny)
-    urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)
-
+    process_sniffed_url(
+        url_report=url_report,
+        url=url,
+        sniffing_config=sniffing_config[url_type],
+        verbose=verbose,
+    )
 
-def process_linkedin_job(url_report, url, verbose=False):
-    if verbose:
-        print("linkedin_job")
-    changed = False
-    if "h1" in url_report.keys():
-        if verbose:
-            print(url["url"], ": ", url_report["h1"])
-        urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
-        changed = True
-    elif "og:title" in url_report.keys():
-        if verbose:
-            print(url["url"], ": ", url_report["og:title"])
-        urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
-        changed = True
+    urls_manager.set_url_json(url=url["url"], value=url_report["json"])
+    urls_manager.touch_url(url=url["url"])
 
-    if "description" in url_report.keys():
-        urls_manager.set_url_description(
-            url=url["url"], value=url_report["description"]
-        )
-        changed = True
-    elif "og:description" in url_report.keys():
-        urls_manager.set_url_description(
-            url=url["url"], value=url_report["og:description"]
-        )
-        changed = True
-    if not changed:
-        urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")
+    return
 
 
-def process_linkedin_post(url_report, url, verbose=False):
+def process_sniffed_url(url_report, url, sniffing_config, verbose=False):
     if verbose:
-        print("linkedin_post or generic")
+        print(url["url_type"])
         print(url["url"])
     changed = False
-    if "h1" in url_report.keys():
-        if verbose:
-            print(url["url"], ": ", url_report["h1"])
-        urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
-        changed = True
-    elif "og:title" in url_report.keys():
-        urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
-        changed = True
-    description = None
-    if "description" in url_report.keys():
-        description = url_report["description"]
-        changed = True
-    elif "og:description" in url_report.keys():
-        description = url_report["og:description"]
+
+    db_fields = {}
+    db_fields["title"] = None
+    db_fields["description"] = None
+    db_fields["url_destiny"] = None
+
+    if "metatags" in sniffing_config.keys():
+        for tag, bd_field in sniffing_config["metatags"].items():
+            if tag in url_report.keys():
+                if bd_field[:1] == "+":
+                    if db_fields[bd_field[1:]] is None:
+                        db_fields[bd_field[1:]] = ""
+                    db_fields[bd_field[1:]] = (
+                        db_fields[bd_field[1:]] + " " + url_report[tag]
+                    )
+                else:
+                    db_fields[bd_field] = url_report[tag]
+
+    if "bodytags" in sniffing_config.keys():
+        for tag, bd_field in sniffing_config["bodytags"].items():
+            if tag in url_report.keys():
+                if bd_field[:1] == "+":
+                    if db_fields[bd_field[1:]] is None:
+                        db_fields[bd_field[1:]] = ""
+                    db_fields[bd_field[1:]] = (
+                        db_fields[bd_field[1:]] + " " + url_report[tag]
+                    )
+                else:
+                    db_fields[bd_field] = url_report[tag]
+
+    if (
+        "atags" in sniffing_config.keys()
+        and "first-tag-as-url_destiny" in sniffing_config["atags"].keys()
+    ):
+        if (
+            url_report["total-a-links"]
+            < sniffing_config["atags"]["first-tag-as-url_destiny"]
+        ):
+            if "first-a-link" in url_report.keys():
+                db_fields["url_destiny"] = url_report["first-a-link"]
+    if (
+        "atags" in sniffing_config.keys()
+        and "load_links" in sniffing_config["atags"].keys()
+    ):
+        for a_link in url_report["a_links"]:
+            urls_manager.add_url(url=a_link["href"], parent_url=url["url"])
+
+    if db_fields["title"] is not None:
+        urls_manager.set_url_title(url=url["url"], value=db_fields["title"])
         changed = True
 
-    if description is not None:
-        urls_manager.set_url_description(url=url["url"], value=description)
+    if db_fields["description"] is not None:
+        urls_manager.set_url_description(url=url["url"], value=db_fields["description"])
         description_links = load_txt.put_urls_from_string(
-            text_to_process=description, parent_url=url["url"]
+            text_to_process=db_fields["description"], parent_url=url["url"]
        )
         urls_manager.set_url_description_links(url=url["url"], value=description_links)
 
-    if not changed:
-        urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")
-
-
-def scrap_url(url, verbose=False):
-    # TODO: Need to change this
-
-    if url["url_type"] is None:
-        if verbose:
-            print("\n\ngeneric:", url["url"])
-        url["url_type"] = "generic"
-    else:
-        if verbose:
-            print("\n\n", url["url_type"] + ":", url["url"])
-    try:
-        url_report = sniff_url.get_tags(url=url["url"])
-    except Exception as e:
-        urls_manager.set_url_error(url=url["url"], value="error")
-        urls_manager.touch_url(url=url["url"])
-        if verbose:
-            print("\n\n!!! ERROR FOR:", url["url"])
-            print(
-                "\n\n!!! you can check the URL using the command sniff-url",
-                url["url"],
-                "\n\n",
-            )
-        return
-
-    if url["url_type"] == "linkedin_redirect":
-        process_linkedin_redirect(url_report=url_report, url=url, verbose=verbose)
-
-    if url["url_type"] == "linkedin_feed":
-        process_linkedin_feed(url_report=url_report, url=url, verbose=verbose)
-
-    if url["url_type"] == "linkedin_job":
-        process_linkedin_job(url_report=url_report, url=url, verbose=verbose)
+        changed = True
 
-    if url["url_type"] == "linkedin_post" or url["url_type"] == "generic":
-        process_linkedin_post(url_report=url_report, url=url, verbose=verbose)
+    if db_fields["url_destiny"] is not None:
+        urls_manager.add_url(url=db_fields["url_destiny"])
+        urls_manager.set_url_destiny(url=url["url"], destiny=db_fields["url_destiny"])
+        changed = True
 
-    urls_manager.set_url_json(url=url["url"], value=url_report["json"])
-    urls_manager.touch_url(url=url["url"])
+    if not changed:
+        urls_manager.set_url_error(
+            url=url["url"],
+            value="error: no title, url_destiny or description was founded",
+        )
 
 
 def isNaN(num):
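
The per-site `process_linkedin_*` helpers are replaced by one generic `process_sniffed_url()` that walks the sniffing config and copies every matching tag into `title`, `description`, or `url_destiny`; a field name prefixed with `+` appends to the field instead of overwriting it. Below is a trimmed, self-contained sketch of that mapping loop; the report and config data are made up for illustration, and the database writes and link queueing done by the real function are omitted.

```python
# Trimmed sketch of the config-driven tag-to-column mapping.
def map_report(url_report: dict, sniffing_config: dict) -> dict:
    db_fields = {"title": None, "description": None, "url_destiny": None}
    for section in ("metatags", "bodytags"):
        for tag, db_field in sniffing_config.get(section, {}).items():
            if tag not in url_report:
                continue
            if db_field.startswith("+"):          # "+description" appends
                name = db_field[1:]
                db_fields[name] = ((db_fields[name] or "") + " " + url_report[tag]).strip()
            else:                                  # a plain name overwrites
                db_fields[db_field] = url_report[tag]
    return db_fields


report = {"h1": "Data Engineer", "og:description": "Remote role", "description": "Apply now"}
config = {
    "bodytags": {"h1": "title"},
    "metatags": {"og:description": "+description", "description": "+description"},
}
print(map_report(report, config))
# {'title': 'Data Engineer', 'description': 'Remote role Apply now', 'url_destiny': None}
```

The real function additionally stores these values through `urls_manager`, queues `url_destiny` as a new URL, and, when `atags` asks for it, adds every harvested `<a>` link with the scraped page as its parent.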

ohmyscrapper/modules/seed.py CHANGED
@@ -2,7 +2,10 @@ import ohmyscrapper.models.urls_manager as urls_manager
 from ohmyscrapper.core import config
 
 
-def seed():
+def seed(reset=False):
+    if reset:
+        urls_manager.reset_seeds()
+
     if not config.url_types_file_exists():
         db_url_types = urls_manager.get_urls_valid_prefix()
         if len(db_url_types) > 0:

ohmyscrapper/modules/sniff_url.py CHANGED
@@ -1,41 +1,74 @@
 import requests
 from bs4 import BeautifulSoup
 import json
+from ohmyscrapper.core import config
 
 
-def sniff_url(url="https://www.linkedin.com/in/cesardesouzacardoso/", silent=False):
-    if not silent:
-        print("checking url:", url)
+def sniff_url(
+    url="https://www.linkedin.com/in/cesardesouzacardoso/",
+    silent=False,
+    sniffing_config={},
+):
+    final_report = {}
+    final_report["error"] = None
+    if "metatags" in sniffing_config:
+        metatags_to_search = sniffing_config["metatags"]
+    else:
+        metatags_to_search = [
+            "description",
+            "og:url",
+            "og:title",
+            "og:description",
+            "og:type",
+            "lnkd:url",
+        ]
+
+    if "bodytags" in sniffing_config:
+        body_tags_to_search = sniffing_config["bodytags"]
+    else:
+        body_tags_to_search = {
+            "h1": "",
+            "h2": "",
+        }
 
-    r = requests.get(url=url)
-    soup = BeautifulSoup(r.text, "html.parser")
+    if type(metatags_to_search) is dict:
+        metatags_to_search = list(metatags_to_search.keys())
 
-    metatags_to_search = [
-        "description",
-        "og:url",
-        "og:title",
-        "og:description",
-        "og:type",
-        "lnkd:url",
-    ]
+    # force clean concatenate without any separator
+    if type(body_tags_to_search) is dict:
+        body_tags_to_search = list(body_tags_to_search.keys())
 
-    text_tags_to_search = {
-        "h1": "",
-        "h2": "|",
-    }
+    if type(body_tags_to_search) is list:
+        body_tags_to_search = dict.fromkeys(body_tags_to_search, " ")
+
+    if not silent:
+        print("checking url:", url)
+
+    try:
+        r = requests.get(url=url, timeout=config.get_sniffing("timeout"))
+        soup = BeautifulSoup(r.text, "html.parser")
+    except requests.exceptions.ReadTimeout:
+        url_domain = url.split("/")[2]
+        final_report["error"] = (
+            f"!!! timeout (10 seconds) while checking the url with domain: `{url_domain}` !!!"
+        )
+        print(f"\n\n{final_report['error']}\n\n")
+        soup = BeautifulSoup("", "html.parser")
 
-    final_report = {}
     final_report["scrapped-url"] = url
-    final_report.update(
-        _extract_meta_tags(
-            soup=soup, silent=silent, metatags_to_search=metatags_to_search
+    if len(metatags_to_search) > 0:
+        final_report.update(
+            _extract_meta_tags(
+                soup=soup, silent=silent, metatags_to_search=metatags_to_search
+            )
         )
-    )
-    final_report.update(
-        _extract_text_tags(
-            soup=soup, silent=silent, text_tags_to_search=text_tags_to_search
+
+    if len(body_tags_to_search) > 0:
+        final_report.update(
+            _extract_text_tags(
+                soup=soup, silent=silent, body_tags_to_search=body_tags_to_search
+            )
        )
-    )
     final_report["a_links"] = _extract_a_tags(soup=soup, silent=silent)
     final_report = _complementary_report(final_report, soup, silent).copy()
     final_report["json"] = json.dumps(final_report)
@@ -85,24 +118,24 @@ def _extract_meta_tags(soup, silent, metatags_to_search):
     return valid_meta_tags
 
 
-def _extract_text_tags(soup, silent, text_tags_to_search):
+def _extract_text_tags(soup, silent, body_tags_to_search):
     valid_text_tags = {}
     if not silent:
         print("\n\n\n\n---- all <text> tags ---\n")
     i = 0
-    for text_tag, separator in text_tags_to_search.items():
+    for text_tag, separator in body_tags_to_search.items():
         if len(soup.find_all(text_tag)) > 0:
             valid_text_tags[text_tag] = []
             for obj_tag in soup.find_all(text_tag):
                 valid_text_tags[text_tag].append(obj_tag.text.strip())
             valid_text_tags[text_tag] = separator.join(valid_text_tags[text_tag])
-        i = i + 1
-        if not silent:
-            print("-- text tag", i, "--")
-            print("name:", text_tag)
-            print("separator:", separator)
-            print("texts:", valid_text_tags[text_tag])
-            print("---------------- \n")
+            i = i + 1
+            if not silent:
+                print("-- text tag", i, "--")
+                print("name:", text_tag)
+                print("separator:", separator)
+                print("texts:", valid_text_tags[text_tag])
+                print("---------------- \n")
     return valid_text_tags
 
 
@@ -128,5 +161,5 @@ def _complementary_report(final_report, soup, silent):
     return final_report
 
 
-def get_tags(url):
-    return sniff_url(url=url, silent=True)
+def get_tags(url, sniffing_config={}):
+    return sniff_url(url=url, silent=True, sniffing_config=sniffing_config)
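
`sniff_url()` now wraps the HTTP request in a timeout taken from `config.get_sniffing("timeout")` (10 seconds by default, per the new `sniffing` block in config.yaml) and degrades to an empty document on `ReadTimeout` instead of crashing the scraping loop. Here is a reduced sketch of that guard, with a hard-coded timeout standing in for the package's config lookup and an example URL; running it needs network access.

```python
# Sketch of the timeout guard: fetch with a timeout, fall back to an empty
# soup on ReadTimeout so a (partial) report can still be built.
import requests
from bs4 import BeautifulSoup


def fetch_soup(url: str, timeout: int = 10):
    error = None
    try:
        r = requests.get(url=url, timeout=timeout)
        soup = BeautifulSoup(r.text, "html.parser")
    except requests.exceptions.ReadTimeout:
        error = f"timeout ({timeout} seconds) while checking {url.split('/')[2]}"
        soup = BeautifulSoup("", "html.parser")  # empty soup keeps the pipeline going
    return soup, error


soup, error = fetch_soup("https://example.com", timeout=10)
print(error or soup.title)
```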

@@ -1,9 +1,10 @@
-Metadata-Version: 2.3
+Metadata-Version: 2.4
 Name: ohmyscrapper
-Version: 0.6.1
+Version: 0.7.4
 Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
+License-Expression: MIT
 Requires-Dist: beautifulsoup4>=4.14.3
 Requires-Dist: google-genai>=1.55.0
 Requires-Dist: markdown>=3.10
@@ -14,9 +15,11 @@ Requires-Dist: requests>=2.32.5
 Requires-Dist: rich>=14.2.0
 Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
+Project-URL: Changelog, https://github.com/bouli/ohmyscrapper/releases/latest
+Project-URL: Repository, https://github.com/bouli/ohmyscrapper
 Description-Content-Type: text/markdown
 
-# 🐶 OhMyScrapper - v0.6.1
+# 🐶 OhMyScrapper - v0.7.4
 
 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.

ohmyscrapper-0.7.4.dist-info/RECORD ADDED
@@ -0,0 +1,21 @@
+ohmyscrapper/__init__.py,sha256=x3wLMhIU744W9DRtXoTrPpWghb7UdC3UJSYZh_gpzlw,7095
+ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
+ohmyscrapper/core/config.py,sha256=aaSLxk6Fuzp88EMax6MAOX3WszH4OfYLz_dJoXlu0ME,3142
+ohmyscrapper/core/config_files.py,sha256=C79-Vgz1E5_jUWtob-yrCyBxsqWEXxqPI_r6TL7D1_Q,3314
+ohmyscrapper/core/default_files/config.yaml,sha256=gi8tqhSumQYJIl8QDisJ6eaib2tdcBNT-GFU-e6Dtns,273
+ohmyscrapper/core/default_files/url_sniffing.yaml,sha256=RU5GYWmC1PdBl4nn7HUfRBwuXz8Rlap75d4W3zWDzPM,465
+ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
+ohmyscrapper/models/urls_manager.py,sha256=k0N1If4YoRUWHX80OyBNEeJNIzDROc2ur6j8q2OBlqo,12103
+ohmyscrapper/modules/classify_urls.py,sha256=GhiosAQUITy1DQe_PksYV9QRKVTgpkSE28dkutzbWVA,1038
+ohmyscrapper/modules/load_txt.py,sha256=pkWBIdh6vORPfENDZ6wGM89vswnOnc1flqKfkLs9RD8,4138
+ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
+ohmyscrapper/modules/process_with_ai.py,sha256=kl39Jzl-PUwh6AfmTZ9SLFUYs9Sk4biqgt8rNz3X1FA,7255
+ohmyscrapper/modules/scrap_urls.py,sha256=uN5j0dychVMGu7n1rcpYdba4sqc47ssyCn0tVaiz-Ic,6264
+ohmyscrapper/modules/seed.py,sha256=hHEGSoPXsmclTaRPeIcK2oC1Xpg3_JqBv_YFMD0m5Jw,1044
+ohmyscrapper/modules/show.py,sha256=jsAs4g8ouA9wymkBfkDCbpVWKD-m_20uKG-m1cZAUGA,3877
+ohmyscrapper/modules/sniff_url.py,sha256=1QnxEdCWLjLh0uM72dlPzst64qglqg2MHA_xYlNcLSA,5435
+ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
+ohmyscrapper-0.7.4.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
+ohmyscrapper-0.7.4.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
+ohmyscrapper-0.7.4.dist-info/METADATA,sha256=CVE8WUcraUtONy9UVIU0y8Y7wjsk4zEmMVfpA_al1CU,4261
+ohmyscrapper-0.7.4.dist-info/RECORD,,

ohmyscrapper-0.6.1.dist-info/RECORD REMOVED
@@ -1,20 +0,0 @@
-ohmyscrapper/__init__.py,sha256=TGOizxll-06nyJdYSM8SRUccQ5Xhv6dDNW6sIbuH0Mk,6493
-ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
-ohmyscrapper/core/config.py,sha256=_me0T6IQqz7bA6Kh6IofNrb-o-07nipcLozUuPrz0l4,2722
-ohmyscrapper/core/config_files.py,sha256=KC3yChTnlclclU9EKTqFBoAu9p6XdOKuegub5NPYDDY,2434
-ohmyscrapper/core/default_files/config.yaml,sha256=9nMOhnnJUcZudXUq5WBEXCCgezfUKI3m4azIuSch_wQ,214
-ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
-ohmyscrapper/models/urls_manager.py,sha256=93WvHnk89hA2BfJfDsD2JlZBeRxo2T_F3FfypiRKKHs,11523
-ohmyscrapper/modules/classify_urls.py,sha256=4rt7_iPDcCGHhJg-f75wBfFmvjdvQj1xFFP-if_IeFM,926
-ohmyscrapper/modules/load_txt.py,sha256=dNkUZ2ehBiPx-q4fPczRiHFvnpzCrjeycFtexhWGmEE,3967
-ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
-ohmyscrapper/modules/process_with_ai.py,sha256=Th-HMJzQYGQ4UBG8AGFsF5cCKIa1HlPATfmGLTTAE24,7222
-ohmyscrapper/modules/scrap_urls.py,sha256=dxpvPyJWtmQj1vZ6IgnhcICWw1eOxYOeplDfZzDTLw4,6864
-ohmyscrapper/modules/seed.py,sha256=qDUE7TWx9iNQEzqThK4p7g8pTZjdpkmoqI8kOo_zdtk,983
-ohmyscrapper/modules/show.py,sha256=jsAs4g8ouA9wymkBfkDCbpVWKD-m_20uKG-m1cZAUGA,3877
-ohmyscrapper/modules/sniff_url.py,sha256=dF6Nv54TC1Si-FRyqtw4V2WNk3NqaJ1h_PzwZm3UNzk,4126
-ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
-ohmyscrapper-0.6.1.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
-ohmyscrapper-0.6.1.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
-ohmyscrapper-0.6.1.dist-info/METADATA,sha256=k06ZCfkLkDuy_GvCj6jAFq2xfCUA5gN8cVlDH-2Q6Bs,4096
-ohmyscrapper-0.6.1.dist-info/RECORD,,