ohmyscrapper 0.7.4__py3-none-any.whl → 0.8.4__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the published versions.
ohmyscrapper/__init__.py CHANGED
@@ -1,26 +1,27 @@
  import argparse
 
+ from ohmyscrapper.core.config import update
+ from ohmyscrapper.modules import cache
  from ohmyscrapper.modules.classify_urls import classify_urls
- from ohmyscrapper.modules.sniff_url import sniff_url
  from ohmyscrapper.modules.load_txt import load_txt
- from ohmyscrapper.modules.seed import seed, export_url_types_to_file
+ from ohmyscrapper.modules.merge_dbs import merge_dbs
+ from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
  from ohmyscrapper.modules.scrap_urls import scrap_urls
+ from ohmyscrapper.modules.seed import export_url_types_to_file, seed
  from ohmyscrapper.modules.show import (
+ export_report,
+ export_urls,
  show_url,
  show_urls,
  show_urls_valid_prefix,
- export_urls,
- export_report,
  )
+ from ohmyscrapper.modules.sniff_url import sniff_url
  from ohmyscrapper.modules.untouch_all import untouch_all
- from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
- from ohmyscrapper.modules.merge_dbs import merge_dbs
- from ohmyscrapper.core.config import update
 
 
  def main():
  parser = argparse.ArgumentParser(prog="ohmyscrapper")
- parser.add_argument("--version", action="version", version="%(prog)s v0.7.4")
+ parser.add_argument("--version", action="version", version="%(prog)s v0.8.4")
 
  update()
  subparsers = parser.add_subparsers(dest="command", help="Available commands")
@@ -104,6 +105,16 @@ def main():
  sniff_url_parser.add_argument(
  "url", default="https://cesarcardoso.cc/", help="Url to sniff"
  )
+ sniff_url_parser.add_argument(
+ "--metatags",
+ default="mt",
+ help="Meta tags you want to watch separated by comma ','",
+ )
+ sniff_url_parser.add_argument(
+ "--bodytags",
+ default="bd",
+ help="Body tags you want to watch separated by comma ','",
+ )
 
  show_urls_parser = subparsers.add_parser("show", help="Show urls and prefixes")
  show_urls_parser.add_argument(
@@ -131,6 +142,7 @@ def main():
  )
  merge_parser = subparsers.add_parser("merge_dbs", help="Merge databases.")
 
+ clean_cache_parser = subparsers.add_parser("cleancache", help="Clean cache.")
  args = parser.parse_args()
 
  if args.command == "classify-urls":
@@ -153,7 +165,15 @@ def main():
  return
 
  if args.command == "sniff-url":
- sniff_url(args.url)
+ sniffing_config = {}
+ if len(args.metatags) > 0:
+ sniffing_config["metatags"] = str(args.metatags).split(",")
+
+ if len(args.bodytags) > 0:
+ sniffing_config["bodytags"] = str(args.bodytags).split(",")
+
+ sniff_url(args.url, sniffing_config=sniffing_config)
+
  return
 
  if args.command == "scrap-urls":
@@ -218,6 +238,9 @@ def main():
  export_report()
  return
 
+ if args.command == "cleancache":
+ cache.clean()
+
 
  if __name__ == "__main__":
  main()
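
On the command line, sniff-url gains --metatags and --bodytags (comma-separated lists of tags to watch, with shipped defaults "mt" and "bd"), and the new cleancache subcommand wipes the page cache introduced in this release. A rough usage sketch; the URL and tag lists are placeholders, and the console-script name is assumed to match the parser's prog:

    ohmyscrapper sniff-url https://example.com --metatags "og:title,og:description" --bodytags "h1,p"
    ohmyscrapper cleancache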
ohmyscrapper/core/config.py CHANGED
@@ -1,4 +1,5 @@
  import os
+
  from ohmyscrapper.core import config_files
 
  default_app_dir = "ohmyscrapper"
@@ -69,14 +70,14 @@ def url_types_file_exists():
  def get_url_types():
  url_types_file = get_files("url_types")
  return config_files.create_and_read_config_file(
- url_types_file, default_app_dir=default_app_dir
+ url_types_file, default_app_dir=default_app_dir, complete_file=False
  )
 
 
  def get_url_sniffing():
  file = get_files("url_sniffing")
  return config_files.create_and_read_config_file(
- file, default_app_dir=default_app_dir
+ file, default_app_dir=default_app_dir, complete_file=False
  )
 
 
ohmyscrapper/core/config_files.py CHANGED
@@ -1,8 +1,11 @@
  import os
+
  import yaml
 
 
- def create_and_read_config_file(file_name, default_app_dir, force_default=False):
+ def create_and_read_config_file(
+ file_name, default_app_dir, force_default=False, complete_file=True
+ ):
  config_file = config_file_path(file_name, default_app_dir)
  default_config_params = _get_default_file(default_file=file_name)
  if force_default or not os.path.exists(config_file):
@@ -15,17 +18,18 @@ def create_and_read_config_file(file_name, default_app_dir, force_default=False)
  else:
  with open(config_file, "r") as f:
  config_params = yaml.safe_load(f.read())
- if complete_config_file(
- config_params=config_params,
- default_config_params=default_config_params,
- file_name=file_name,
- default_app_dir=default_app_dir,
- ):
- config_params = create_and_read_config_file(
+ if complete_file:
+ if complete_config_file(
+ config_params=config_params,
+ default_config_params=default_config_params,
  file_name=file_name,
  default_app_dir=default_app_dir,
- force_default=force_default,
- )
+ ):
+ config_params = create_and_read_config_file(
+ file_name=file_name,
+ default_app_dir=default_app_dir,
+ force_default=force_default,
+ )
 
  if config_params is None:
  config_params = create_and_read_config_file(
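
Read together with the config.py hunk earlier, the new complete_file flag lets url_types.yaml and url_sniffing.yaml be returned exactly as the user wrote them, while the main config keeps the old behaviour of back-filling missing keys from the bundled defaults. A minimal sketch of the two call styles, assuming the default file names shipped with the package:

import ohmyscrapper.core.config_files as config_files

# main config: missing keys are completed from the packaged defaults (previous behaviour)
params = config_files.create_and_read_config_file("config.yaml", default_app_dir="ohmyscrapper")

# url_types / url_sniffing: read back verbatim, no completion pass
url_types = config_files.create_and_read_config_file(
    "url_types.yaml", default_app_dir="ohmyscrapper", complete_file=False
)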
ohmyscrapper/core/default_files/config.yaml CHANGED
@@ -2,11 +2,12 @@ db:
  db_file: local.db
 
  default_dirs:
- db: ./db
- input: ./input
- output: ./output
- prompts: ./prompts
- templates: ./templates
+ db: ./ohmyscrapper_db
+ input: ./ohmyscrapper_input
+ output: ./ohmyscrapper_output
+ prompts: ./ohmyscrapper_prompts
+ templates: ./ohmyscrapper_templates
+ cache: ./ohmyscrapper_cache
 
  default_files:
  url_types: url_types.yaml
@@ -17,3 +18,6 @@ ai:
 
  sniffing:
  timeout: 10
+ use-browser: false
+ browser-waiting-time: 5
+ round-sleeping: 10
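
The three new sniffing keys drive the Selenium support added in this release. A user-level override that switches scraping to a real browser might look like this (a sketch; "firefox" is one of the values browser.py recognises, alongside "safari", "ie", and the Chrome fallback):

sniffing:
  timeout: 10
  use-browser: firefox        # any truthy value enables Selenium; the string picks the driver
  browser-waiting-time: 5     # seconds to wait after driver.get() before reading page_source
  round-sleeping: 10          # scrap-urls pauses between round-sleeping/2 and round-sleeping seconds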
ohmyscrapper/core/default_files/url_sniffing.yaml CHANGED
@@ -26,4 +26,4 @@ linkedin_redirect:
 
  read_all_a_tags:
  atags:
- load_atags: True
+ load_links: True
ohmyscrapper/models/urls_manager.py CHANGED
@@ -1,9 +1,11 @@
+ import glob
  import os
  import sqlite3
  import time
- import glob
- import pandas as pd
  from urllib.parse import urlparse, urlunparse
+
+ import pandas as pd
+
  from ohmyscrapper.core import config
 
 
@@ -71,7 +73,7 @@ def seeds(seeds={}):
 
  @use_connection
  def reset_seeds():
- sql = "DELETE FROM urls_valid_prefix"
+ sql = "DELETE FROM urls_valid_prefix WHERE 1 = 1"
  c = conn.cursor()
  c.execute(sql)
  conn.commit()
ohmyscrapper/modules/browser.py ADDED
@@ -0,0 +1,28 @@
+ from selenium import webdriver
+
+ from ohmyscrapper.core.config import get_sniffing
+
+
+ def get_driver():
+ if get_sniffing("use-browser") == "safari":
+ from selenium.webdriver.safari.options import Options
+
+ options = Options()
+ driver = webdriver.Safari(options=options)
+ elif get_sniffing("use-browser") == "firefox":
+ from selenium.webdriver.firefox.options import Options
+
+ options = Options()
+ driver = webdriver.Firefox(options=options)
+ elif get_sniffing("use-browser") == "ie":
+ from selenium.webdriver.ie.options import Options
+
+ options = Options()
+ driver = webdriver.Ie(options=options)
+ else: # default: chrome
+ from selenium.webdriver.chrome.options import Options
+
+ options = Options()
+ driver = webdriver.Chrome(options=options)
+
+ return driver
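
browser.py centralises Selenium driver creation for the new use-browser setting. A minimal usage sketch (the URL is a placeholder; quitting the driver is standard Selenium hygiene and is not handled by the module itself):

from ohmyscrapper.modules import browser

driver = browser.get_driver()       # Safari, Firefox, IE, or the Chrome fallback, per use-browser
driver.get("https://example.com")   # placeholder URL
html = driver.page_source           # raw page HTML, as consumed by sniff_url.get_url()
driver.quit()                       # not called by ohmyscrapper; shown here for completeness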
ohmyscrapper/modules/cache.py ADDED
@@ -0,0 +1,100 @@
+ import os
+
+ from ohmyscrapper.core import config
+
+
+ def safe_cache_id(func):
+ def _filter_cache_id(*args, **kwargs):
+ if "cache_id" in args:
+ args["cache_id"] = filter_cache_id(args["cache_id"])
+
+ if "cache_id" in kwargs:
+ kwargs["cache_id"] = filter_cache_id(kwargs["cache_id"])
+
+ return func(*args, **kwargs)
+
+ return _filter_cache_id
+
+
+ def filter_cache_id(cache_id):
+ cache_id = cache_id.replace('"', "").replace("\\", "")
+ cache_id = f'"{cache_id}"'
+ return cache_id
+
+
+ cache_files_extension = "html"
+
+
+ @safe_cache_id
+ def set(text: str, cache_id: str):
+ cache_folder = config.get_dir("cache")
+ cache_index_file_path = get_cache_index_path()
+
+ cache_folder_files = os.listdir(cache_folder)
+ cached_file_index = get_index_from_file_index(_safe_cache_id=cache_id)
+ if cached_file_index is not None:
+ new_file_index = cached_file_index
+ else:
+ new_file_index = len(cache_folder_files)
+ with open(cache_index_file_path, "a") as cache_index_file_writer:
+ cache_index_file_writer.write(f"\n{new_file_index}: {cache_id}")
+
+ new_file_name = f"{new_file_index}.{cache_files_extension}"
+ new_file_path = os.path.join(cache_folder, new_file_name)
+ with open(new_file_path, "w+") as new_file_writer:
+ new_file_writer.write(text)
+
+
+ @safe_cache_id
+ def get(cache_id: str) -> str:
+ cached_file_index = get_index_from_file_index(_safe_cache_id=cache_id)
+ code = get_cached_file_by_index(cached_file_index=cached_file_index)
+ return code
+
+
+ def get_index_from_file_index(_safe_cache_id):
+ cache_index_file = get_cache_index_file()
+ if cache_index_file.find(_safe_cache_id) < 1:
+ return None
+ cache_index_file = cache_index_file[: cache_index_file.find(_safe_cache_id) - 2]
+ cached_file_index = int(cache_index_file.split("\n")[-1].strip())
+ return cached_file_index
+
+
+ def get_cache_index_path() -> str:
+ cache_index_file_name = "cache_index.yaml"
+ cache_folder = config.get_dir("cache")
+ cache_index_file_path = os.path.join(cache_folder, cache_index_file_name)
+ if not os.path.exists(cache_index_file_path):
+ with open(cache_index_file_path, "w+") as cache_index_file_writer:
+ cache_index_file_writer.write(f"0: {cache_index_file_name}")
+
+ return cache_index_file_path
+
+
+ def get_cache_index_file() -> str:
+ with open(get_cache_index_path(), "r") as f:
+ cache_index_file_content = f.read()
+
+ return cache_index_file_content
+
+
+ def get_cached_file_by_index(cached_file_index: int) -> str:
+ code = None
+ cache_folder = config.get_dir("cache")
+ cached_file_name = f"{cached_file_index}.{cache_files_extension}"
+ cached_file_path = os.path.join(cache_folder, cached_file_name)
+ if not os.path.exists(cached_file_path):
+ return None
+ with open(cached_file_path, "r") as cached_file_reader:
+ code = cached_file_reader.read()
+ return code
+
+
+ def clean():
+ cache_folder = config.get_dir("cache")
+ cache_folder_files = os.listdir(cache_folder)
+ for file in cache_folder_files:
+ file_to_clean = os.path.join(cache_folder, file)
+ if os.path.exists(file_to_clean):
+ os.remove(file_to_clean)
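
The cache is keyed by a quoted cache id: cache_index.yaml in the cache directory maps an integer to each id, and entry N is stored next to it as N.html. A short usage sketch (the id mirrors the "sniff-urf:" prefix used by sniff_url.get_url(); the HTML text is a placeholder):

from ohmyscrapper.modules import cache

cache.set(text="<html>hello</html>", cache_id="sniff-urf:https://example.com/")
html = cache.get(cache_id="sniff-urf:https://example.com/")   # returns the cached text, or None on a miss
cache.clean()                                                  # deletes every file in the cache directory

cache.clean() is what the new cleancache command calls, so clearing the cache also removes cache_index.yaml; it is recreated on the next lookup.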
ohmyscrapper/modules/classify_urls.py CHANGED
@@ -1,7 +1,9 @@
+ import time
+
+ import pandas as pd
+
  import ohmyscrapper.models.urls_manager as urls_manager
  from ohmyscrapper.modules import seed
- import pandas as pd
- import time
 
 
  def classify_urls(recursive=False):
ohmyscrapper/modules/load_txt.py CHANGED
@@ -1,5 +1,7 @@
  import os
+
  from urlextract import URLExtract
+
  import ohmyscrapper.models.urls_manager as urls_manager
  from ohmyscrapper.core import config
 
ohmyscrapper/modules/process_with_ai.py CHANGED
@@ -1,13 +1,15 @@
- import ohmyscrapper.models.urls_manager as urls_manager
- from ohmyscrapper.core import config
- from bs4 import BeautifulSoup
- from google import genai
- from dotenv import load_dotenv
+ import json
+ import os
  import random
  import time
- import os
+
  import yaml
- import json
+ from bs4 import BeautifulSoup
+ from dotenv import load_dotenv
+ from google import genai
+
+ import ohmyscrapper.models.urls_manager as urls_manager
+ from ohmyscrapper.core import config
 
  # TODO: !!! REFACTOR !!!
  load_dotenv()
@@ -85,15 +87,12 @@ def process_with_ai(recursive=True, triggered_times=0):
 
  texts = ""
  for index, row in df.iterrows():
- texts = (
- texts
- + f"""
+ texts = texts + f"""
  <text>
  <id>{str(row['id'])}</id>
  {row['description']}
  </text>
  """
- )
  if texts == "":
  print("no urls to process")
  return
ohmyscrapper/modules/scrap_urls.py CHANGED
@@ -1,14 +1,15 @@
+ import random
+ import time
+
  import ohmyscrapper.models.urls_manager as urls_manager
- import ohmyscrapper.modules.sniff_url as sniff_url
- import ohmyscrapper.modules.load_txt as load_txt
+ import ohmyscrapper.modules.browser as browser
  import ohmyscrapper.modules.classify_urls as classify_urls
+ import ohmyscrapper.modules.load_txt as load_txt
+ import ohmyscrapper.modules.sniff_url as sniff_url
  from ohmyscrapper.core import config
 
- import time
- import random
-
 
- def scrap_url(url, verbose=False):
+ def scrap_url(url, verbose=False, driver=None):
  if url["url_type"] is None:
  url["url_type"] = "generic"
 
@@ -32,7 +33,7 @@ def scrap_url(url, verbose=False):
  sniffing_config = config.get_url_sniffing()
 
  url_report = sniff_url.get_tags(
- url=url["url"], sniffing_config=sniffing_config[url_type]
+ url=url["url"], sniffing_config=sniffing_config[url_type], driver=driver
  )
  except Exception as e:
  urls_manager.set_url_error(url=url["url"], value="error on scrapping")
@@ -147,6 +148,7 @@ def scrap_urls(
  only_parents=True,
  verbose=False,
  n_urls=0,
+ driver=None,
  ):
  limit = 10
  classify_urls.classify_urls()
@@ -170,13 +172,19 @@ def scrap_urls(
  time.sleep(wait)
 
  print("🐕 Scrapper is sniffing the url...")
- scrap_url(url=url, verbose=verbose)
+
+ if driver is None and config.get_sniffing("use-browser"):
+ driver = browser.get_driver()
+ scrap_url(url=url, verbose=verbose, driver=driver)
 
  n_urls = n_urls + len(urls)
  print(f"-- 🗃️ {n_urls} scraped urls...")
  classify_urls.classify_urls()
  if recursive:
- wait = random.randint(5, 10)
+ wait = random.randint(
+ int(config.get_sniffing("round-sleeping") / 2),
+ int(config.get_sniffing("round-sleeping")),
+ )
  print(
  f"🐶 Scrapper is sleeping for {wait} seconds before next round of {limit} urls"
  )
@@ -188,6 +196,7 @@ def scrap_urls(
  only_parents=only_parents,
  verbose=verbose,
  n_urls=n_urls,
+ driver=driver,
  )
  else:
  print("scrapping is over...")
ohmyscrapper/modules/show.py CHANGED
@@ -1,10 +1,12 @@
- import ohmyscrapper.models.urls_manager as urls_manager
- from ohmyscrapper.core import config
  import math
  import os
+
  from rich.console import Console
  from rich.table import Table
 
+ import ohmyscrapper.models.urls_manager as urls_manager
+ from ohmyscrapper.core import config
+
 
  def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):
  output_folder = config.get_dir("output")
ohmyscrapper/modules/sniff_url.py CHANGED
@@ -1,16 +1,22 @@
+ import json
+ import os
+ import time
+
  import requests
  from bs4 import BeautifulSoup
- import json
+
+ import ohmyscrapper.modules.browser as browser
  from ohmyscrapper.core import config
+ from ohmyscrapper.modules import cache
 
 
  def sniff_url(
  url="https://www.linkedin.com/in/cesardesouzacardoso/",
  silent=False,
  sniffing_config={},
+ driver=None,
  ):
  final_report = {}
- final_report["error"] = None
  if "metatags" in sniffing_config:
  metatags_to_search = sniffing_config["metatags"]
  else:
@@ -45,13 +51,14 @@ def sniff_url(
  print("checking url:", url)
 
  try:
- r = requests.get(url=url, timeout=config.get_sniffing("timeout"))
- soup = BeautifulSoup(r.text, "html.parser")
+ r = get_url(url=url, driver=driver)
+ soup = BeautifulSoup(r, "html.parser")
  except requests.exceptions.ReadTimeout:
  url_domain = url.split("/")[2]
  final_report["error"] = (
  f"!!! timeout (10 seconds) while checking the url with domain: `{url_domain}` !!!"
  )
+
  print(f"\n\n{final_report['error']}\n\n")
  soup = BeautifulSoup("", "html.parser")
 
@@ -69,14 +76,14 @@ def sniff_url(
  soup=soup, silent=silent, body_tags_to_search=body_tags_to_search
  )
  )
- final_report["a_links"] = _extract_a_tags(soup=soup, silent=silent)
+ final_report["a_links"] = _extract_a_tags(soup=soup, silent=silent, url=url)
  final_report = _complementary_report(final_report, soup, silent).copy()
  final_report["json"] = json.dumps(final_report)
 
  return final_report
 
 
- def _extract_a_tags(soup, silent):
+ def _extract_a_tags(soup, silent, url=None):
  a_links = []
  if not silent:
  print("\n\n\n\n---- all <a> links ---")
@@ -84,12 +91,19 @@ def _extract_a_tags(soup, silent):
  i = 0
  for a_tag in soup.find_all("a"):
  i = i + 1
- a_links.append({"text": a_tag.text, "href": a_tag.get("href")})
+
+ href = a_tag.get("href")
+ if href is not None:
+ if url is not None and href[:1] == "/":
+ domain = url.split("//")[0] + "//" + url.split("//")[1].split("/")[0]
+ href = domain + href
+
+ a_links.append({"text": a_tag.text, "href": href})
  if not silent:
  print("\n-- <a> link", i, "-- ")
  print("target:", a_tag.get("target"))
  print("text:", str(a_tag.text).strip())
- print("href:", a_tag.get("href"))
+ print("href:", href)
  print("-------------- ")
  return a_links
 
@@ -124,9 +138,21 @@ def _extract_text_tags(soup, silent, body_tags_to_search):
  print("\n\n\n\n---- all <text> tags ---\n")
  i = 0
  for text_tag, separator in body_tags_to_search.items():
- if len(soup.find_all(text_tag)) > 0:
+ tag = text_tag
+ tag_class = None
+ tag_id = None
+
+ if len(text_tag.split(".")) > 1:
+ tag = text_tag.split(".")[0]
+ tag_class = text_tag.split(".")[1]
+
+ if len(text_tag.split("#")) > 1:
+ tag = text_tag.split("#")[0]
+ tag_id = text_tag.split("#")[1]
+
+ if len(soup.find_all(tag, class_=tag_class, id=tag_id)) > 0:
  valid_text_tags[text_tag] = []
- for obj_tag in soup.find_all(text_tag):
+ for obj_tag in soup.find_all(tag, class_=tag_class, id=tag_id):
  valid_text_tags[text_tag].append(obj_tag.text.strip())
  valid_text_tags[text_tag] = separator.join(valid_text_tags[text_tag])
  i = i + 1
@@ -161,5 +187,34 @@ def _complementary_report(final_report, soup, silent):
  return final_report
 
 
- def get_tags(url, sniffing_config={}):
- return sniff_url(url=url, silent=True, sniffing_config=sniffing_config)
+ def get_tags(url, sniffing_config={}, driver=None):
+ return sniff_url(
+ url=url, silent=True, sniffing_config=sniffing_config, driver=driver
+ )
+
+
+ def get_url(url, driver=None):
+ cache_prefix = "sniff-urf:"
+ cached_code = cache.get(cache_id=cache_prefix + url)
+
+ if cached_code is not None:
+ print("You used the cache for this URL.")
+ return cached_code
+
+ if driver is None and config.get_sniffing("use-browser"):
+ driver = browser.get_driver()
+
+ if driver is not None:
+ try:
+ driver.get(url)
+ time.sleep(config.get_sniffing("browser-waiting-time"))
+ driver.implicitly_wait(config.get_sniffing("browser-waiting-time"))
+ code = driver.page_source
+ cache.set(text=code, cache_id=cache_prefix + url)
+ return code
+ except:
+ print("error")
+ pass
+ code = requests.get(url=url, timeout=config.get_sniffing("timeout")).text
+ cache.set(text=code, cache_id=cache_prefix + url)
+ return code
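
Body-tag selectors in the sniffing config can now carry a class or id suffix: "h2.title" is split into tag "h2" plus class "title", and "div#main" into tag "div" plus id "main", which is then passed to soup.find_all(tag, class_=tag_class, id=tag_id). A standalone sketch of that parsing (the selectors are hypothetical examples, not values shipped in url_sniffing.yaml):

# mirrors the splitting done in _extract_text_tags()
for text_tag in ("h2.title", "div#main", "p"):
    tag, tag_class, tag_id = text_tag, None, None
    if len(text_tag.split(".")) > 1:
        tag, tag_class = text_tag.split(".")[0], text_tag.split(".")[1]
    if len(text_tag.split("#")) > 1:
        tag, tag_id = text_tag.split("#")[0], text_tag.split("#")[1]
    print(tag, tag_class, tag_id)   # h2 title None / div None main / p None None

Relative links get a similar quality-of-life fix: _extract_a_tags() now prefixes an href starting with "/" with the scheme and host of the page being sniffed, so a_links entries come back as absolute URLs.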
ohmyscrapper-0.8.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ohmyscrapper
- Version: 0.7.4
+ Version: 0.8.4
  Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
  Author: Cesar Cardoso
  Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
@@ -13,13 +13,14 @@ Requires-Dist: python-dotenv>=1.2.1
  Requires-Dist: pyyaml>=6.0.3
  Requires-Dist: requests>=2.32.5
  Requires-Dist: rich>=14.2.0
+ Requires-Dist: selenium>=4.39.0
  Requires-Dist: urlextract>=1.9.0
  Requires-Python: >=3.11
- Project-URL: Changelog, https://github.com/bouli/ohmyscrapper/releases/latest
  Project-URL: Repository, https://github.com/bouli/ohmyscrapper
+ Project-URL: Changelog, https://github.com/bouli/ohmyscrapper/releases/latest
  Description-Content-Type: text/markdown
 
- # 🐶 OhMyScrapper - v0.7.4
+ # 🐶 OhMyScrapper - v0.8.4
 
  OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
  final report with general information about job positions.
ohmyscrapper-0.8.4.dist-info/RECORD ADDED
@@ -0,0 +1,23 @@
+ ohmyscrapper/__init__.py,sha256=C_nhLCKrLogCLQVVHlTJNMEOSFiLDTR0bBCtW8T8kXE,7859
+ ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
+ ohmyscrapper/core/config.py,sha256=2S8iYMdN5-fCX4EW1cmSd4-XabzqxIgbupBuymV3yjY,3185
+ ohmyscrapper/core/config_files.py,sha256=5FyPFpN7WQrlgQWr85s5NF-UbnzbyFsWEVVMOs8iyaw,3411
+ ohmyscrapper/core/default_files/config.yaml,sha256=ETuTyFM1fedjehM9cZRoKxRKcYDH0LlPFAZ3vLj0uxU,436
+ ohmyscrapper/core/default_files/url_sniffing.yaml,sha256=HUwmGUwuJy7t97bJHgNiZOl1thvD9bLaelPgbEr5bMY,465
+ ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
+ ohmyscrapper/models/urls_manager.py,sha256=XC8HODdsCEo_nn1j7nH_jy9AUTb4PpmkGlaFWV048TM,12117
+ ohmyscrapper/modules/browser.py,sha256=pH41NVqYgay_zEIZfncJbtwz_13REX5HVH8uk581sM4,857
+ ohmyscrapper/modules/cache.py,sha256=3EQnv9VYJWrE5fdLwkGEUOAHV16nprhyid6MlBpa9Gg,3228
+ ohmyscrapper/modules/classify_urls.py,sha256=oK_UhQPF976cexlarqi14pSw8tWLGYfaIMCXzbAhnpI,1040
+ ohmyscrapper/modules/load_txt.py,sha256=Gpob1W_LLfkBnNbtqxgCRNGeyufmHECreDqTlj9O_Mk,4140
+ ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
+ ohmyscrapper/modules/process_with_ai.py,sha256=TqebqC3_rCx6cbvq3oQhaXLZxGUYpKvhyH3I3zjsA94,7221
+ ohmyscrapper/modules/scrap_urls.py,sha256=affq5Vx5BKrl7uL2mpcThDBOXznq0d5fz1if5xAttOA,6627
+ ohmyscrapper/modules/seed.py,sha256=hHEGSoPXsmclTaRPeIcK2oC1Xpg3_JqBv_YFMD0m5Jw,1044
+ ohmyscrapper/modules/show.py,sha256=i5l8_Zooj6vg1JLqWtvGPWHv7wL53aHZ43-SKS1sF9Y,3879
+ ohmyscrapper/modules/sniff_url.py,sha256=NpIMJxNEUzmDkFGVqDJXgVtTWEGKRE_dSiJHNz-vXoE,7027
+ ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
+ ohmyscrapper-0.8.4.dist-info/WHEEL,sha256=fAguSjoiATBe7TNBkJwOjyL1Tt4wwiaQGtNtjRPNMQA,80
+ ohmyscrapper-0.8.4.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
+ ohmyscrapper-0.8.4.dist-info/METADATA,sha256=h2Agb2KCKiBkX-HEj_8f9EuV3NOq6AJ9h1WrnPQ79iU,4293
+ ohmyscrapper-0.8.4.dist-info/RECORD,,
ohmyscrapper-0.8.4.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: uv 0.9.17
+ Generator: uv 0.9.28
  Root-Is-Purelib: true
- Tag: py3-none-any
+ Tag: py3-none-any
ohmyscrapper-0.7.4.dist-info/RECORD REMOVED
@@ -1,21 +0,0 @@
- ohmyscrapper/__init__.py,sha256=x3wLMhIU744W9DRtXoTrPpWghb7UdC3UJSYZh_gpzlw,7095
- ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
- ohmyscrapper/core/config.py,sha256=aaSLxk6Fuzp88EMax6MAOX3WszH4OfYLz_dJoXlu0ME,3142
- ohmyscrapper/core/config_files.py,sha256=C79-Vgz1E5_jUWtob-yrCyBxsqWEXxqPI_r6TL7D1_Q,3314
- ohmyscrapper/core/default_files/config.yaml,sha256=gi8tqhSumQYJIl8QDisJ6eaib2tdcBNT-GFU-e6Dtns,273
- ohmyscrapper/core/default_files/url_sniffing.yaml,sha256=RU5GYWmC1PdBl4nn7HUfRBwuXz8Rlap75d4W3zWDzPM,465
- ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
- ohmyscrapper/models/urls_manager.py,sha256=k0N1If4YoRUWHX80OyBNEeJNIzDROc2ur6j8q2OBlqo,12103
- ohmyscrapper/modules/classify_urls.py,sha256=GhiosAQUITy1DQe_PksYV9QRKVTgpkSE28dkutzbWVA,1038
- ohmyscrapper/modules/load_txt.py,sha256=pkWBIdh6vORPfENDZ6wGM89vswnOnc1flqKfkLs9RD8,4138
- ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
- ohmyscrapper/modules/process_with_ai.py,sha256=kl39Jzl-PUwh6AfmTZ9SLFUYs9Sk4biqgt8rNz3X1FA,7255
- ohmyscrapper/modules/scrap_urls.py,sha256=uN5j0dychVMGu7n1rcpYdba4sqc47ssyCn0tVaiz-Ic,6264
- ohmyscrapper/modules/seed.py,sha256=hHEGSoPXsmclTaRPeIcK2oC1Xpg3_JqBv_YFMD0m5Jw,1044
- ohmyscrapper/modules/show.py,sha256=jsAs4g8ouA9wymkBfkDCbpVWKD-m_20uKG-m1cZAUGA,3877
- ohmyscrapper/modules/sniff_url.py,sha256=1QnxEdCWLjLh0uM72dlPzst64qglqg2MHA_xYlNcLSA,5435
- ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
- ohmyscrapper-0.7.4.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
- ohmyscrapper-0.7.4.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
- ohmyscrapper-0.7.4.dist-info/METADATA,sha256=CVE8WUcraUtONy9UVIU0y8Y7wjsk4zEmMVfpA_al1CU,4261
- ohmyscrapper-0.7.4.dist-info/RECORD,,