ohmyscrapper 0.6.1-py3-none-any.whl → 0.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ohmyscrapper/__init__.py CHANGED
@@ -20,7 +20,7 @@ from ohmyscrapper.core.config import update
 
 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.6.1")
+    parser.add_argument("--version", action="version", version="%(prog)s v0.7.0")
 
     update()
     subparsers = parser.add_subparsers(dest="command", help="Available commands")

ohmyscrapper/core/config.py CHANGED
@@ -67,6 +67,18 @@ def get_url_types():
     )
 
 
+def get_url_sniffing():
+    file = get_files("url_sniffing")
+    return config_files.create_and_read_config_file(
+        file, default_app_dir=default_app_dir
+    )
+
+
+def append_url_sniffing(data):
+    file = get_files("url_sniffing")
+    _append_config_file(data, file)
+
+
 def append_url_types(url_types):
     url_types_file = get_files("url_types")
     _append_config_file(url_types, url_types_file)

ohmyscrapper/core/default_files/config.yaml CHANGED
@@ -10,6 +10,7 @@ default_dirs:
 
 default_files:
   url_types: url_types.yaml
+  url_sniffing: url_sniffing.yaml
 
 ai:
   default_prompt_file: prompt.md

ohmyscrapper/core/default_files/url_sniffing.yaml ADDED
@@ -0,0 +1,25 @@
+linkedin_feed:
+  metatags:
+    og:url: url_destiny
+
+linkedin_job:
+  bodytags:
+    h1: title
+  metatags:
+    og:title: title
+    og:description: description
+    description: description
+
+linkedin_post:
+  bodytags:
+    h1: title
+  metatags:
+    og:title: title
+    og:description: description
+    description: description
+
+linkedin_redirect:
+  metatags:
+    og:url: url_destiny
+  atags:
+    first-tag-as-url_destiny: 5
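
Note (illustration, not part of the diff): url_sniffing.yaml maps, per url_type, HTML meta tags and body tags onto fields of the urls table (title, description, url_destiny). scrap_urls.py loads it through config.get_url_sniffing() and registers missing types through config.append_url_sniffing(). A hedged sketch of registering a custom entry from Python; the "company_page" type and its tag mapping are invented for the example:

from ohmyscrapper.core import config

# Hypothetical url_type: <h1> and og:title feed the title column; a leading "+"
# appends to the field instead of overwriting it (see process_sniffed_url below).
config.append_url_sniffing(
    {
        "company_page": {
            "bodytags": {"h1": "title"},
            "metatags": {"og:title": "+title", "og:description": "description"},
        }
    }
)
print(config.get_url_sniffing()["company_page"])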

ohmyscrapper/models/urls_manager.py CHANGED
@@ -29,7 +29,11 @@ def use_connection(func):
     def provide_connection(*args, **kwargs):
         global conn
         with get_db_connection() as conn:
-            return func(*args, **kwargs)
+            try:
+                return func(*args, **kwargs)
+            except:
+                update_db()
+                return func(*args, **kwargs)
 
     return provide_connection
 
@@ -38,7 +42,7 @@ def create_tables(conn):
 
     c = conn.cursor()
     c.execute(
-        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, h1 TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
+        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, title TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
     )
     c.execute(
         "CREATE TABLE IF NOT EXISTS ai_log (id INTEGER PRIMARY KEY, instructions STRING, response STRING, model STRING, prompt_file STRING, prompt_name STRING, created_at DATETIME)"
@@ -49,6 +53,14 @@ def create_tables(conn):
     )
 
 
+def update_db():
+    try:
+        c = conn.cursor()
+        c.execute("ALTER TABLE urls RENAME COLUMN h1 TO title")
+    except:
+        pass
+
+
 def seeds(seeds={}):
 
     for url_type, url_prefix in seeds.items():
@@ -118,7 +130,7 @@ def get_urls_report():
         SELECT
             u.id,
             u.url,
-            u.h1
+            u.title
         FROM urls u
         INNER JOIN parent_url p
             ON u.url = p.parent_url
@@ -127,9 +139,9 @@ def get_urls_report():
             u.id,
             u.url_type,
             u.url,
-            COALESCE(u.h1, p.h1) as h1,
+            COALESCE(u.title, p.title) as title,
             p.url as parent_url,
-            p.h1 as parent_h1
+            p.title as parent_title
         FROM urls u
         LEFT JOIN parents p
             ON u.parent_url = p.url
@@ -185,12 +197,12 @@ def get_url_like_unclassified(like_condition):
 
 
 @use_connection
-def add_url(url, h1=None, parent_url=None):
+def add_url(url, title=None, parent_url=None):
     url = clean_url(url)
     c = conn.cursor()
 
-    if h1 is not None:
-        h1 = h1.strip()
+    if title is not None:
+        title = title.strip()
 
     if parent_url is None:
         parent_url = None
@@ -199,8 +211,8 @@ def add_url(url, h1=None, parent_url=None):
 
     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url, h1, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
-            (url, h1, parent_url, int(time.time())),
+            "INSERT INTO urls (url, title, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
+            (url, title, parent_url, int(time.time())),
        )
         conn.commit()
 
@@ -239,20 +251,20 @@ def set_url_destiny(url, destiny):
 
 
 @use_connection
-def set_url_h1(url, value):
+def set_url_title(url, value):
     value = str(value).strip()
     url = clean_url(url)
     c = conn.cursor()
-    c.execute("UPDATE urls SET h1 = ? WHERE url = ?", (value, url))
+    c.execute("UPDATE urls SET title = ? WHERE url = ?", (value, url))
     conn.commit()
 
 
 @use_connection
-def set_url_h1_by_id(id, value):
+def set_url_title_by_id(id, value):
     value = str(value).strip()
 
     c = conn.cursor()
-    c.execute("UPDATE urls SET h1 = ? WHERE id = ?", (value, id))
+    c.execute("UPDATE urls SET title = ? WHERE id = ?", (value, id))
     conn.commit()
 
 
@@ -427,16 +439,16 @@ def merge_dbs() -> None:
 
 
 @use_connection
-def merge_url(url, h1, last_touch, created_at, description, json):
+def merge_url(url, title, last_touch, created_at, description, json):
     url = clean_url(url)
     c = conn.cursor()
 
-    if h1 is not None:
-        h1 = h1.strip()
+    if title is not None:
+        title = title.strip()
 
     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url, h1, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
-            (url, h1, last_touch, created_at, description, json),
+            "INSERT INTO urls (url, title, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
+            (url, title, last_touch, created_at, description, json),
        )
         conn.commit()
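
Note (illustration, not part of the diff): the urls_manager.py changes rename the h1 column to title and pair it with a retry: use_connection now catches a failed query, runs update_db() (ALTER TABLE urls RENAME COLUMN h1 TO title), and calls the wrapped function again. A minimal self-contained sketch of that lazy-migration pattern; the in-memory database and the sqlite3.OperationalError handling are choices made for the example, not the package's actual module:

import sqlite3
from functools import wraps

conn = sqlite3.connect(":memory:")
# Old 0.6.x schema, still using the h1 column.
conn.execute("CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url TEXT UNIQUE, h1 TEXT)")

def update_db():
    # One-shot migration; ignore the error if the column is already renamed.
    try:
        conn.execute("ALTER TABLE urls RENAME COLUMN h1 TO title")
    except sqlite3.OperationalError:
        pass

def use_connection(func):
    @wraps(func)
    def provide_connection(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except sqlite3.OperationalError:  # e.g. "no such column: title" on an old DB
            update_db()
            return func(*args, **kwargs)
    return provide_connection

@use_connection
def set_url_title(url, value):
    conn.execute("UPDATE urls SET title = ? WHERE url = ?", (value, url))
    conn.commit()

set_url_title("https://example.com", "Example")  # first call migrates, then retries and succeeds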

ohmyscrapper/modules/classify_urls.py CHANGED
@@ -1,11 +1,15 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.modules import seed
 import pandas as pd
 import time
 
 
 def classify_urls(recursive=False):
-    urls_manager.seeds()
     df = urls_manager.get_urls_valid_prefix()
+    if len(df) == 0:
+        seed.seed()
+        classify_urls(recursive=recursive)
+        return
 
     keep_alive = True
     while keep_alive:

ohmyscrapper/modules/process_with_ai.py CHANGED
@@ -28,13 +28,13 @@ def process_ai_response(response):
         url_parent = urls_manager.get_url_by_id(url_child_xml["id"])
         if len(url_parent) > 0:
             url_parent = url_parent.iloc[0]
-            h1 = url_child_xml.copy()
-            del h1["id"]
-            del h1["url"]
-            h1 = " - ".join(h1.values())
+            title = url_child_xml.copy()
+            del title["id"]
+            del title["url"]
+            title = " - ".join(title.values())
             if url_parent["description_links"] > 1 and url_child_xml["id"] != "":
-                print("-- child updated -- \n", url_child_xml["url"], ":", h1)
-                urls_manager.set_url_h1(url_child_xml["url"], h1)
+                print("-- child updated -- \n", url_child_xml["url"], ":", title)
+                urls_manager.set_url_title(url_child_xml["url"], title)
                 urls_manager.set_url_ai_processed_by_url(
                     url_child_xml["url"], str(json.dumps(url_child_xml))
                 )
@@ -43,8 +43,8 @@
                     url_parent["url"], "children-update"
                 )
             else:
-                print("-- parent updated -- \n", url_parent["url"], ":", h1)
-                urls_manager.set_url_h1(url_parent["url"], h1)
+                print("-- parent updated -- \n", url_parent["url"], ":", title)
+                urls_manager.set_url_title(url_parent["url"], title)
                 urls_manager.set_url_ai_processed_by_url(
                     url_parent["url"], str(json.dumps(url_child_xml))
                 )

ohmyscrapper/modules/scrap_urls.py CHANGED
@@ -2,154 +2,132 @@ import ohmyscrapper.models.urls_manager as urls_manager
 import ohmyscrapper.modules.sniff_url as sniff_url
 import ohmyscrapper.modules.load_txt as load_txt
 import ohmyscrapper.modules.classify_urls as classify_urls
+from ohmyscrapper.core import config
 
 import time
 import random
 
 
-def process_linkedin_redirect(url_report, url, verbose=False):
-    if verbose:
-        print("linkedin_redirect")
-
-    if url_report["total-a-links"] < 5:
-        if "first-a-link" in url_report.keys():
-            url_destiny = url_report["first-a-link"]
-        else:
-            urls_manager.set_url_error(url=url["url"], value="error: no first-a-link")
-            if verbose:
-                print("no url for:", url["url"])
-            return
-    else:
-        if "og:url" in url_report.keys():
-            url_destiny = url_report["og:url"]
-        else:
-            urls_manager.set_url_error(url=url["url"], value="error: no og:url")
-            if verbose:
-                print("no url for:", url["url"])
-            return
-    if verbose:
-        print(url["url"], ">>", url_destiny)
-    urls_manager.add_url(url=url_destiny)
-    urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)
-
+def scrap_url(url, verbose=False):
+    if url["url_type"] is None:
+        url["url_type"] = "generic"
 
-def process_linkedin_feed(url_report, url, verbose=False):
     if verbose:
-        print("linkedin_feed")
+        print("\n\n", url["url_type"] + ":", url["url"])
 
-    if "og:url" in url_report.keys():
-        url_destiny = url_report["og:url"]
-    else:
-        urls_manager.set_url_error(url=url["url"], value="error: no og:url")
+    try:
+        url_type = url["url_type"]
+        sniffing_config = config.get_url_sniffing()
+
+        if url_type not in sniffing_config:
+            default_type_sniffing = {
+                "bodytags": [{"h1": "title"}],
+                "metatags": [
+                    {"og:title": "title"},
+                    {"og:description": "description"},
+                    {"description": "description"},
+                ],
+            }
+            config.append_url_sniffing({url_type: default_type_sniffing})
+            sniffing_config = config.get_url_sniffing()
+
+        url_report = sniff_url.get_tags(
+            url=url["url"], sniffing_config=sniffing_config[url_type]
+        )
+    except Exception as e:
+        urls_manager.set_url_error(url=url["url"], value="error on scrapping")
+        urls_manager.touch_url(url=url["url"])
         if verbose:
-            print("no url for:", url["url"])
+            print("\n\n!!! ERROR FOR:", url["url"])
+            print(
+                "\n\n!!! you can check the URL using the command sniff-url",
+                url["url"],
+                "\n\n",
+            )
         return
 
-    if verbose:
-        print(url["url"], ">>", url_destiny)
-    urls_manager.add_url(url=url_destiny)
-    urls_manager.set_url_destiny(url=url["url"], destiny=url_destiny)
-
+    process_sniffed_url(
+        url_report=url_report,
+        url=url,
+        sniffing_config=sniffing_config[url_type],
+        verbose=verbose,
+    )
 
-def process_linkedin_job(url_report, url, verbose=False):
-    if verbose:
-        print("linkedin_job")
-    changed = False
-    if "h1" in url_report.keys():
-        if verbose:
-            print(url["url"], ": ", url_report["h1"])
-        urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
-        changed = True
-    elif "og:title" in url_report.keys():
-        if verbose:
-            print(url["url"], ": ", url_report["og:title"])
-        urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
-        changed = True
+    urls_manager.set_url_json(url=url["url"], value=url_report["json"])
+    urls_manager.touch_url(url=url["url"])
 
-    if "description" in url_report.keys():
-        urls_manager.set_url_description(
-            url=url["url"], value=url_report["description"]
-        )
-        changed = True
-    elif "og:description" in url_report.keys():
-        urls_manager.set_url_description(
-            url=url["url"], value=url_report["og:description"]
-        )
-        changed = True
-    if not changed:
-        urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")
+    return
 
 
-def process_linkedin_post(url_report, url, verbose=False):
+def process_sniffed_url(url_report, url, sniffing_config, verbose=False):
     if verbose:
-        print("linkedin_post or generic")
+        print(url["url_type"])
         print(url["url"])
     changed = False
-    if "h1" in url_report.keys():
-        if verbose:
-            print(url["url"], ": ", url_report["h1"])
-        urls_manager.set_url_h1(url=url["url"], value=url_report["h1"])
-        changed = True
-    elif "og:title" in url_report.keys():
-        urls_manager.set_url_h1(url=url["url"], value=url_report["og:title"])
-        changed = True
-    description = None
-    if "description" in url_report.keys():
-        description = url_report["description"]
-        changed = True
-    elif "og:description" in url_report.keys():
-        description = url_report["og:description"]
+
+    db_fields = {}
+    db_fields["title"] = None
+    db_fields["description"] = None
+    db_fields["url_destiny"] = None
+
+    if "metatags" in sniffing_config.keys():
+        for tag, bd_field in sniffing_config["metatags"].items():
+            if tag in url_report.keys():
+                if bd_field[:1] == "+":
+                    if db_fields[bd_field[1:]] is None:
+                        db_fields[bd_field[1:]] = ""
+                    db_fields[bd_field[1:]] = (
+                        db_fields[bd_field[1:]] + " " + url_report[tag]
+                    )
+                else:
+                    db_fields[bd_field] = url_report[tag]
+
+    if "bodytags" in sniffing_config.keys():
+        for tag, bd_field in sniffing_config["bodytags"].items():
+            if tag in url_report.keys():
+                if bd_field[:1] == "+":
+                    if db_fields[bd_field[1:]] is None:
+                        db_fields[bd_field[1:]] = ""
+                    db_fields[bd_field[1:]] = (
+                        db_fields[bd_field[1:]] + " " + url_report[tag]
+                    )
+                else:
+                    db_fields[bd_field] = url_report[tag]
+
+    if (
+        "atags" in sniffing_config.keys()
+        and "first-tag-as-url_destiny" in sniffing_config["atags"].keys()
+    ):
+        if (
+            url_report["total-a-links"]
+            < sniffing_config["atags"]["first-tag-as-url_destiny"]
+        ):
+            if "first-a-link" in url_report.keys():
+                db_fields["url_destiny"] = url_report["first-a-link"]
+
+    if db_fields["title"] is not None:
+        urls_manager.set_url_title(url=url["url"], value=db_fields["title"])
         changed = True
 
-    if description is not None:
-        urls_manager.set_url_description(url=url["url"], value=description)
+    if db_fields["description"] is not None:
+        urls_manager.set_url_description(url=url["url"], value=db_fields["description"])
         description_links = load_txt.put_urls_from_string(
-            text_to_process=description, parent_url=url["url"]
+            text_to_process=db_fields["description"], parent_url=url["url"]
        )
         urls_manager.set_url_description_links(url=url["url"], value=description_links)
 
-    if not changed:
-        urls_manager.set_url_error(url=url["url"], value="error: no h1 or description")
-
-
-def scrap_url(url, verbose=False):
-    # TODO: Need to change this
-
-    if url["url_type"] is None:
-        if verbose:
-            print("\n\ngeneric:", url["url"])
-        url["url_type"] = "generic"
-    else:
-        if verbose:
-            print("\n\n", url["url_type"] + ":", url["url"])
-    try:
-        url_report = sniff_url.get_tags(url=url["url"])
-    except Exception as e:
-        urls_manager.set_url_error(url=url["url"], value="error")
-        urls_manager.touch_url(url=url["url"])
-        if verbose:
-            print("\n\n!!! ERROR FOR:", url["url"])
-            print(
-                "\n\n!!! you can check the URL using the command sniff-url",
-                url["url"],
-                "\n\n",
-            )
-        return
-
-    if url["url_type"] == "linkedin_redirect":
-        process_linkedin_redirect(url_report=url_report, url=url, verbose=verbose)
-
-    if url["url_type"] == "linkedin_feed":
-        process_linkedin_feed(url_report=url_report, url=url, verbose=verbose)
-
-    if url["url_type"] == "linkedin_job":
-        process_linkedin_job(url_report=url_report, url=url, verbose=verbose)
+        changed = True
 
-    if url["url_type"] == "linkedin_post" or url["url_type"] == "generic":
-        process_linkedin_post(url_report=url_report, url=url, verbose=verbose)
+    if db_fields["url_destiny"] is not None:
+        urls_manager.add_url(url=db_fields["url_destiny"])
+        urls_manager.set_url_destiny(url=url["url"], destiny=db_fields["url_destiny"])
+        changed = True
 
-    urls_manager.set_url_json(url=url["url"], value=url_report["json"])
-    urls_manager.touch_url(url=url["url"])
+    if not changed:
+        urls_manager.set_url_error(
+            url=url["url"],
+            value="error: no title, url_destiny or description was founded",
+        )
 
 
 def isNaN(num):
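
Note (illustration, not part of the diff): process_sniffed_url replaces the four per-site handlers with one data-driven mapping from the sniff report to three database fields; a target field prefixed with "+" appends the tag value instead of overwriting it. A standalone sketch of that rule, with invented sample report and config values:

def map_report_to_fields(url_report, sniffing_config):
    # Sketch of the mapping rule in process_sniffed_url: "field" overwrites, "+field" appends.
    db_fields = {"title": None, "description": None, "url_destiny": None}
    for section in ("metatags", "bodytags"):
        for tag, field in sniffing_config.get(section, {}).items():
            if tag not in url_report:
                continue
            if field.startswith("+"):
                name = field[1:]
                db_fields[name] = ((db_fields[name] or "") + " " + url_report[tag]).strip()
            else:
                db_fields[field] = url_report[tag]
    return db_fields

report = {"og:title": "Data Engineer", "h1": "Acme Corp", "description": "Remote role"}
cfg = {"metatags": {"og:title": "title", "description": "description"}, "bodytags": {"h1": "+title"}}
print(map_report_to_fields(report, cfg))
# {'title': 'Data Engineer Acme Corp', 'description': 'Remote role', 'url_destiny': None}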

ohmyscrapper/modules/sniff_url.py CHANGED
@@ -3,39 +3,62 @@ from bs4 import BeautifulSoup
 import json
 
 
-def sniff_url(url="https://www.linkedin.com/in/cesardesouzacardoso/", silent=False):
+def sniff_url(
+    url="https://www.linkedin.com/in/cesardesouzacardoso/",
+    silent=False,
+    sniffing_config={},
+):
+    if "metatags" in sniffing_config:
+        metatags_to_search = sniffing_config["metatags"]
+    else:
+        metatags_to_search = [
+            "description",
+            "og:url",
+            "og:title",
+            "og:description",
+            "og:type",
+            "lnkd:url",
+        ]
+
+    if "bodytags" in sniffing_config:
+        body_tags_to_search = sniffing_config["bodytags"]
+    else:
+        body_tags_to_search = {
+            "h1": "",
+            "h2": "",
+        }
+
+    if type(metatags_to_search) is dict:
+        metatags_to_search = list(metatags_to_search.keys())
+
+    # force clean concatenate without any separator
+    if type(body_tags_to_search) is dict:
+        body_tags_to_search = list(body_tags_to_search.keys())
+
+    if type(body_tags_to_search) is list:
+        body_tags_to_search = dict.fromkeys(body_tags_to_search, " ")
+
     if not silent:
         print("checking url:", url)
 
     r = requests.get(url=url)
     soup = BeautifulSoup(r.text, "html.parser")
 
-    metatags_to_search = [
-        "description",
-        "og:url",
-        "og:title",
-        "og:description",
-        "og:type",
-        "lnkd:url",
-    ]
-
-    text_tags_to_search = {
-        "h1": "",
-        "h2": "|",
-    }
-
     final_report = {}
     final_report["scrapped-url"] = url
-    final_report.update(
-        _extract_meta_tags(
-            soup=soup, silent=silent, metatags_to_search=metatags_to_search
+    if len(metatags_to_search) > 0:
+        final_report.update(
+            _extract_meta_tags(
+                soup=soup, silent=silent, metatags_to_search=metatags_to_search
+            )
         )
-    )
-    final_report.update(
-        _extract_text_tags(
-            soup=soup, silent=silent, text_tags_to_search=text_tags_to_search
+
+    if len(body_tags_to_search) > 0:
+        final_report.update(
+            _extract_text_tags(
+                soup=soup, silent=silent, body_tags_to_search=body_tags_to_search
+            )
         )
-    )
     final_report["a_links"] = _extract_a_tags(soup=soup, silent=silent)
     final_report = _complementary_report(final_report, soup, silent).copy()
     final_report["json"] = json.dumps(final_report)
@@ -85,12 +108,12 @@ def _extract_meta_tags(soup, silent, metatags_to_search):
     return valid_meta_tags
 
 
-def _extract_text_tags(soup, silent, text_tags_to_search):
+def _extract_text_tags(soup, silent, body_tags_to_search):
     valid_text_tags = {}
     if not silent:
         print("\n\n\n\n---- all <text> tags ---\n")
     i = 0
-    for text_tag, separator in text_tags_to_search.items():
+    for text_tag, separator in body_tags_to_search.items():
         if len(soup.find_all(text_tag)) > 0:
             valid_text_tags[text_tag] = []
             for obj_tag in soup.find_all(text_tag):
@@ -128,5 +151,5 @@ def _complementary_report(final_report, soup, silent):
     return final_report
 
 
-def get_tags(url):
-    return sniff_url(url=url, silent=True)
+def get_tags(url, sniffing_config={}):
+    return sniff_url(url=url, silent=True, sniffing_config=sniffing_config)
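
Note (illustration, not part of the diff): sniff_url() and get_tags() now accept a sniffing_config dict and fall back to the old hard-coded tag lists when it is empty. A hedged usage sketch; the URL is arbitrary, and apart from "scrapped-url" the report keys shown depend on what the fetched page actually exposes:

import ohmyscrapper.modules.sniff_url as sniff_url

# Restrict the sniff to the tags a url_type entry would request.
report = sniff_url.get_tags(
    url="https://example.com",
    sniffing_config={"metatags": {"og:title": "title"}, "bodytags": {"h1": "title"}},
)
print(report["scrapped-url"])
print(report.get("og:title"), report.get("h1"), report.get("total-a-links"))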

ohmyscrapper-0.6.1.dist-info/METADATA → ohmyscrapper-0.7.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ohmyscrapper
-Version: 0.6.1
+Version: 0.7.0
 Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
@@ -16,7 +16,7 @@ Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 
-# 🐶 OhMyScrapper - v0.6.1
+# 🐶 OhMyScrapper - v0.7.0
 
 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.

ohmyscrapper-0.7.0.dist-info/RECORD ADDED
@@ -0,0 +1,21 @@
+ohmyscrapper/__init__.py,sha256=w5Ty9eszf8tEv72IQrFov0YbZWMqsraq448xhX3YGQs,6493
+ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
+ohmyscrapper/core/config.py,sha256=i_RA-zReNQIWWmsFar85qzRUqdqvTFMPeCP7Hya7ltU,2996
+ohmyscrapper/core/config_files.py,sha256=KC3yChTnlclclU9EKTqFBoAu9p6XdOKuegub5NPYDDY,2434
+ohmyscrapper/core/default_files/config.yaml,sha256=bgPBVlze2tOCbyrA47h_5BJ35UsXnqsjQszzy0vn-Pw,248
+ohmyscrapper/core/default_files/url_sniffing.yaml,sha256=MKdVR5HQ1i2yTRw2ijzxPSmIyhUno_R4L2k17r3EBBc,417
+ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
+ohmyscrapper/models/urls_manager.py,sha256=FC1j72M1gzNwC_PzPqnew986b-BI6s7zUv8Z7HiM1M0,11849
+ohmyscrapper/modules/classify_urls.py,sha256=GhiosAQUITy1DQe_PksYV9QRKVTgpkSE28dkutzbWVA,1038
+ohmyscrapper/modules/load_txt.py,sha256=dNkUZ2ehBiPx-q4fPczRiHFvnpzCrjeycFtexhWGmEE,3967
+ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
+ohmyscrapper/modules/process_with_ai.py,sha256=kl39Jzl-PUwh6AfmTZ9SLFUYs9Sk4biqgt8rNz3X1FA,7255
+ohmyscrapper/modules/scrap_urls.py,sha256=CNoEC-d1r-u4qxnEVimm4ctP6MJGdU8y8VI2Nx0bBdM,6033
+ohmyscrapper/modules/seed.py,sha256=qDUE7TWx9iNQEzqThK4p7g8pTZjdpkmoqI8kOo_zdtk,983
+ohmyscrapper/modules/show.py,sha256=jsAs4g8ouA9wymkBfkDCbpVWKD-m_20uKG-m1cZAUGA,3877
+ohmyscrapper/modules/sniff_url.py,sha256=zJ2Uox2aUdQibL4UFLxg3t7GqJ7WwWEl0q3QSUbMEbc,4960
+ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
+ohmyscrapper-0.7.0.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
+ohmyscrapper-0.7.0.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
+ohmyscrapper-0.7.0.dist-info/METADATA,sha256=Doakf4oDT6oskPGdSlEoRJHBxUmm9FhWaHfDlNIfNuM,4096
+ohmyscrapper-0.7.0.dist-info/RECORD,,

ohmyscrapper-0.6.1.dist-info/RECORD DELETED
@@ -1,20 +0,0 @@
-ohmyscrapper/__init__.py,sha256=TGOizxll-06nyJdYSM8SRUccQ5Xhv6dDNW6sIbuH0Mk,6493
-ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
-ohmyscrapper/core/config.py,sha256=_me0T6IQqz7bA6Kh6IofNrb-o-07nipcLozUuPrz0l4,2722
-ohmyscrapper/core/config_files.py,sha256=KC3yChTnlclclU9EKTqFBoAu9p6XdOKuegub5NPYDDY,2434
-ohmyscrapper/core/default_files/config.yaml,sha256=9nMOhnnJUcZudXUq5WBEXCCgezfUKI3m4azIuSch_wQ,214
-ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
-ohmyscrapper/models/urls_manager.py,sha256=93WvHnk89hA2BfJfDsD2JlZBeRxo2T_F3FfypiRKKHs,11523
-ohmyscrapper/modules/classify_urls.py,sha256=4rt7_iPDcCGHhJg-f75wBfFmvjdvQj1xFFP-if_IeFM,926
-ohmyscrapper/modules/load_txt.py,sha256=dNkUZ2ehBiPx-q4fPczRiHFvnpzCrjeycFtexhWGmEE,3967
-ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
-ohmyscrapper/modules/process_with_ai.py,sha256=Th-HMJzQYGQ4UBG8AGFsF5cCKIa1HlPATfmGLTTAE24,7222
-ohmyscrapper/modules/scrap_urls.py,sha256=dxpvPyJWtmQj1vZ6IgnhcICWw1eOxYOeplDfZzDTLw4,6864
-ohmyscrapper/modules/seed.py,sha256=qDUE7TWx9iNQEzqThK4p7g8pTZjdpkmoqI8kOo_zdtk,983
-ohmyscrapper/modules/show.py,sha256=jsAs4g8ouA9wymkBfkDCbpVWKD-m_20uKG-m1cZAUGA,3877
-ohmyscrapper/modules/sniff_url.py,sha256=dF6Nv54TC1Si-FRyqtw4V2WNk3NqaJ1h_PzwZm3UNzk,4126
-ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
-ohmyscrapper-0.6.1.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
-ohmyscrapper-0.6.1.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
-ohmyscrapper-0.6.1.dist-info/METADATA,sha256=k06ZCfkLkDuy_GvCj6jAFq2xfCUA5gN8cVlDH-2Q6Bs,4096
-ohmyscrapper-0.6.1.dist-info/RECORD,,