ohmyscrapper 0.7.4.tar.gz → 0.8.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/PKG-INFO +3 -2
  2. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/README.md +1 -1
  3. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/pyproject.toml +3 -2
  4. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/__init__.py +20 -2
  5. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/core/config.py +2 -2
  6. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/core/config_files.py +13 -10
  7. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/core/default_files/config.yaml +3 -0
  8. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/core/default_files/url_sniffing.yaml +1 -1
  9. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/models/urls_manager.py +1 -1
  10. ohmyscrapper-0.8.2/src/ohmyscrapper/modules/browser.py +27 -0
  11. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/scrap_urls.py +13 -4
  12. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/sniff_url.py +50 -11
  13. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/__main__.py +0 -0
  14. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/core/default_files/url_types.yaml +0 -0
  15. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/classify_urls.py +0 -0
  16. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/load_txt.py +0 -0
  17. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/merge_dbs.py +0 -0
  18. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/process_with_ai.py +0 -0
  19. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/seed.py +0 -0
  20. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/show.py +0 -0
  21. {ohmyscrapper-0.7.4 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/untouch_all.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ohmyscrapper
-Version: 0.7.4
+Version: 0.8.2
 Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
@@ -13,13 +13,14 @@ Requires-Dist: python-dotenv>=1.2.1
 Requires-Dist: pyyaml>=6.0.3
 Requires-Dist: requests>=2.32.5
 Requires-Dist: rich>=14.2.0
+Requires-Dist: selenium>=4.39.0
 Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
 Project-URL: Changelog, https://github.com/bouli/ohmyscrapper/releases/latest
 Project-URL: Repository, https://github.com/bouli/ohmyscrapper
 Description-Content-Type: text/markdown

-# 🐶 OhMyScrapper - v0.7.4
+# 🐶 OhMyScrapper - v0.8.2

 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.
@@ -1,4 +1,4 @@
-# 🐶 OhMyScrapper - v0.7.4
+# 🐶 OhMyScrapper - v0.8.2

 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.
@@ -1,6 +1,6 @@
 [project]
 name = "ohmyscrapper"
-version = "0.7.4"
+version = "0.8.2"
 license = "MIT"
 description = "OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions."
 readme = "README.md"
@@ -17,6 +17,7 @@ dependencies = [
     "pyyaml>=6.0.3",
     "requests>=2.32.5",
     "rich>=14.2.0",
+    "selenium>=4.39.0",
     "urlextract>=1.9.0",
 ]

@@ -34,7 +35,7 @@ build-backend = "uv_build"
 [tool.bumpversion]
 tag = true
 tag_name = "v{new_version}"
-pre_commit_hooks = ["uvx black ./src", "git add src", "git commit -m 'chore: Beautify with black'", "uv sync --upgrade", "git add uv.lock"]
+pre_commit_hooks = ["uvx black ./src", "git add src", "uv sync --upgrade", "git add uv.lock"]
 commit = true

 [[tool.bumpversion.files]]
@@ -20,7 +20,7 @@ from ohmyscrapper.core.config import update

 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.7.4")
+    parser.add_argument("--version", action="version", version="%(prog)s v0.8.2")

     update()
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
@@ -104,6 +104,16 @@ def main():
     sniff_url_parser.add_argument(
         "url", default="https://cesarcardoso.cc/", help="Url to sniff"
     )
+    sniff_url_parser.add_argument(
+        "--metatags",
+        default="mt",
+        help="Meta tags you want to watch separated by comma ','",
+    )
+    sniff_url_parser.add_argument(
+        "--bodytags",
+        default="bd",
+        help="Body tags you want to watch separated by comma ','",
+    )

     show_urls_parser = subparsers.add_parser("show", help="Show urls and prefixes")
     show_urls_parser.add_argument(
@@ -153,7 +163,15 @@ def main():
         return

     if args.command == "sniff-url":
-        sniff_url(args.url)
+        sniffing_config = {}
+        if len(args.metatags) > 0:
+            sniffing_config["metatags"] = str(args.metatags).split(",")
+
+        if len(args.bodytags) > 0:
+            sniffing_config["bodytags"] = str(args.bodytags).split(",")
+
+        sniff_url(args.url, sniffing_config=sniffing_config)
+
         return

     if args.command == "scrap-urls":
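
The two new flags above feed `sniff_url()` as plain lists: the handler comma-splits each value into `sniffing_config`. A hedged sketch of what a hypothetical invocation `ohmyscrapper sniff-url https://example.com --metatags "og:title,og:description" --bodytags "h1,p"` produces (URL and tag names are only illustrative):

    from ohmyscrapper.modules.sniff_url import sniff_url

    # Mirrors the comma-splitting done in main() above.
    args_metatags = "og:title,og:description"
    args_bodytags = "h1,p"

    sniffing_config = {
        "metatags": str(args_metatags).split(","),  # ["og:title", "og:description"]
        "bodytags": str(args_bodytags).split(","),  # ["h1", "p"]
    }
    sniff_url("https://example.com", sniffing_config=sniffing_config)
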
@@ -69,14 +69,14 @@ def url_types_file_exists():
 def get_url_types():
     url_types_file = get_files("url_types")
     return config_files.create_and_read_config_file(
-        url_types_file, default_app_dir=default_app_dir
+        url_types_file, default_app_dir=default_app_dir, complete_file=False
     )


 def get_url_sniffing():
     file = get_files("url_sniffing")
     return config_files.create_and_read_config_file(
-        file, default_app_dir=default_app_dir
+        file, default_app_dir=default_app_dir, complete_file=False
     )


@@ -2,7 +2,9 @@ import os
 import yaml


-def create_and_read_config_file(file_name, default_app_dir, force_default=False):
+def create_and_read_config_file(
+    file_name, default_app_dir, force_default=False, complete_file=True
+):
     config_file = config_file_path(file_name, default_app_dir)
     default_config_params = _get_default_file(default_file=file_name)
     if force_default or not os.path.exists(config_file):
@@ -15,17 +17,18 @@ def create_and_read_config_file(file_name, default_app_dir, force_default=False)
     else:
         with open(config_file, "r") as f:
             config_params = yaml.safe_load(f.read())
-        if complete_config_file(
-            config_params=config_params,
-            default_config_params=default_config_params,
-            file_name=file_name,
-            default_app_dir=default_app_dir,
-        ):
-            config_params = create_and_read_config_file(
+        if complete_file:
+            if complete_config_file(
+                config_params=config_params,
+                default_config_params=default_config_params,
                 file_name=file_name,
                 default_app_dir=default_app_dir,
-                force_default=force_default,
-            )
+            ):
+                config_params = create_and_read_config_file(
+                    file_name=file_name,
+                    default_app_dir=default_app_dir,
+                    force_default=force_default,
+                )

     if config_params is None:
         config_params = create_and_read_config_file(
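
The new `complete_file` keyword lets callers skip the default-merging step for an existing file. A minimal sketch of the two paths, assuming the package is installed (the file names and path are illustrative):

    from ohmyscrapper.core.config_files import create_and_read_config_file

    default_app_dir = "~/.ohmyscrapper"  # hypothetical app dir

    # complete_file=True (default): a file missing keys is completed from the packaged
    # defaults via complete_config_file() and then re-read.
    params = create_and_read_config_file("config.yaml", default_app_dir=default_app_dir)

    # complete_file=False: the existing file is returned exactly as written, which is how
    # get_url_types() and get_url_sniffing() now call it.
    params = create_and_read_config_file(
        "url_sniffing.yaml", default_app_dir=default_app_dir, complete_file=False
    )
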
@@ -17,3 +17,6 @@ ai:

 sniffing:
   timeout: 10
+  use-browser: false
+  browser-waiting-time: 5
+  round-sleeping: 10
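
The three new keys are read through `config.get_sniffing()` in the modules further down this diff. A hedged summary of how they appear to be consumed (values above are the shipped defaults):

    from ohmyscrapper.core import config

    config.get_sniffing("use-browser")           # false disables Selenium; "safari", "firefox" or "ie" pick that driver, any other truthy value falls back to Chrome
    config.get_sniffing("browser-waiting-time")  # seconds slept (also the implicit wait) after driver.get() before page_source is read
    config.get_sniffing("round-sleeping")        # upper bound in seconds for the pause between scraping rounds
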
@@ -26,4 +26,4 @@ linkedin_redirect:

 read_all_a_tags:
   atags:
-    load_atags: True
+    load_links: True
@@ -71,7 +71,7 @@ def seeds(seeds={}):

 @use_connection
 def reset_seeds():
-    sql = "DELETE FROM urls_valid_prefix"
+    sql = "DELETE FROM urls_valid_prefix WHERE 1 = 1"
     c = conn.cursor()
     c.execute(sql)
     conn.commit()
@@ -0,0 +1,27 @@
+from selenium import webdriver
+from ohmyscrapper.core.config import get_sniffing
+
+
+def get_driver():
+    if get_sniffing("use-browser") == "safari":
+        from selenium.webdriver.safari.options import Options
+
+        options = Options()
+        driver = webdriver.Safari(options=options)
+    elif get_sniffing("use-browser") == "firefox":
+        from selenium.webdriver.firefox.options import Options
+
+        options = Options()
+        driver = webdriver.Firefox(options=options)
+    elif get_sniffing("use-browser") == "ie":
+        from selenium.webdriver.ie.options import Options
+
+        options = Options()
+        driver = webdriver.Ie(options=options)
+    else:  # default: chrome
+        from selenium.webdriver.chrome.options import Options
+
+        options = Options()
+        driver = webdriver.Chrome(options=options)
+
+    return driver
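
A hedged usage sketch for the new module, using standard Selenium calls (the URL is illustrative). `get_driver()` is what `scrap_urls` and `sniff_url.get_url` call when `use-browser` is enabled:

    from ohmyscrapper.modules import browser

    driver = browser.get_driver()      # Safari/Firefox/IE per config, Chrome otherwise
    driver.get("https://example.com")  # load the page in a real browser session
    html = driver.page_source          # rendered HTML, later handed to BeautifulSoup
    driver.quit()                      # the package itself does not appear to close the driver
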
@@ -2,13 +2,14 @@ import ohmyscrapper.models.urls_manager as urls_manager
 import ohmyscrapper.modules.sniff_url as sniff_url
 import ohmyscrapper.modules.load_txt as load_txt
 import ohmyscrapper.modules.classify_urls as classify_urls
+import ohmyscrapper.modules.browser as browser
 from ohmyscrapper.core import config

 import time
 import random


-def scrap_url(url, verbose=False):
+def scrap_url(url, verbose=False, driver=None):
     if url["url_type"] is None:
         url["url_type"] = "generic"

@@ -32,7 +33,7 @@ def scrap_url(url, verbose=False):
         sniffing_config = config.get_url_sniffing()

         url_report = sniff_url.get_tags(
-            url=url["url"], sniffing_config=sniffing_config[url_type]
+            url=url["url"], sniffing_config=sniffing_config[url_type], driver=driver
         )
     except Exception as e:
         urls_manager.set_url_error(url=url["url"], value="error on scrapping")
@@ -147,6 +148,7 @@ def scrap_urls(
     only_parents=True,
     verbose=False,
     n_urls=0,
+    driver=None,
 ):
     limit = 10
     classify_urls.classify_urls()
@@ -170,13 +172,19 @@
         time.sleep(wait)

         print("🐕 Scrapper is sniffing the url...")
-        scrap_url(url=url, verbose=verbose)
+
+        if driver is None and config.get_sniffing("use-browser"):
+            driver = browser.get_driver()
+        scrap_url(url=url, verbose=verbose, driver=driver)

     n_urls = n_urls + len(urls)
     print(f"-- 🗃️ {n_urls} scraped urls...")
     classify_urls.classify_urls()
     if recursive:
-        wait = random.randint(5, 10)
+        wait = random.randint(
+            int(config.get_sniffing("round-sleeping") / 2),
+            int(config.get_sniffing("round-sleeping")),
+        )
         print(
             f"🐶 Scrapper is sleeping for {wait} seconds before next round of {limit} urls"
         )
@@ -188,6 +196,7 @@
             only_parents=only_parents,
             verbose=verbose,
             n_urls=n_urls,
+            driver=driver,
         )
     else:
         print("scrapping is over...")
@@ -2,15 +2,17 @@ import requests
 from bs4 import BeautifulSoup
 import json
 from ohmyscrapper.core import config
+import ohmyscrapper.modules.browser as browser
+import time


 def sniff_url(
     url="https://www.linkedin.com/in/cesardesouzacardoso/",
     silent=False,
     sniffing_config={},
+    driver=None,
 ):
     final_report = {}
-    final_report["error"] = None
     if "metatags" in sniffing_config:
         metatags_to_search = sniffing_config["metatags"]
     else:
@@ -45,13 +47,14 @@
     print("checking url:", url)

     try:
-        r = requests.get(url=url, timeout=config.get_sniffing("timeout"))
-        soup = BeautifulSoup(r.text, "html.parser")
+        r = get_url(url=url, driver=driver)
+        soup = BeautifulSoup(r, "html.parser")
     except requests.exceptions.ReadTimeout:
         url_domain = url.split("/")[2]
         final_report["error"] = (
             f"!!! timeout (10 seconds) while checking the url with domain: `{url_domain}` !!!"
         )
+
         print(f"\n\n{final_report['error']}\n\n")
         soup = BeautifulSoup("", "html.parser")

@@ -69,14 +72,14 @@
             soup=soup, silent=silent, body_tags_to_search=body_tags_to_search
         )
     )
-    final_report["a_links"] = _extract_a_tags(soup=soup, silent=silent)
+    final_report["a_links"] = _extract_a_tags(soup=soup, silent=silent, url=url)
    final_report = _complementary_report(final_report, soup, silent).copy()
    final_report["json"] = json.dumps(final_report)

    return final_report


-def _extract_a_tags(soup, silent):
+def _extract_a_tags(soup, silent, url=None):
     a_links = []
     if not silent:
         print("\n\n\n\n---- all <a> links ---")
@@ -84,12 +87,18 @@ def _extract_a_tags(soup, silent):
     i = 0
     for a_tag in soup.find_all("a"):
         i = i + 1
-        a_links.append({"text": a_tag.text, "href": a_tag.get("href")})
+
+        href = a_tag.get("href")
+        if url is not None and href[:1] == "/":
+            domain = url.split("//")[0] + "//" + url.split("//")[1].split("/")[0]
+            href = domain + href
+
+        a_links.append({"text": a_tag.text, "href": href})
         if not silent:
             print("\n-- <a> link", i, "-- ")
             print("target:", a_tag.get("target"))
             print("text:", str(a_tag.text).strip())
-            print("href:", a_tag.get("href"))
+            print("href:", href)
             print("-------------- ")
     return a_links

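The relative-link handling above can be traced with a small, hypothetical example (URL and href are illustrative):

    url = "https://example.com/jobs/list?page=2"
    href = "/jobs/123"

    # Same expression as in _extract_a_tags: scheme + "//" + host
    domain = url.split("//")[0] + "//" + url.split("//")[1].split("/")[0]
    print(domain)         # https://example.com
    print(domain + href)  # https://example.com/jobs/123
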
@@ -124,9 +133,21 @@ def _extract_text_tags(soup, silent, body_tags_to_search):
         print("\n\n\n\n---- all <text> tags ---\n")
     i = 0
     for text_tag, separator in body_tags_to_search.items():
-        if len(soup.find_all(text_tag)) > 0:
+        tag = text_tag
+        tag_class = None
+        tag_id = None
+
+        if len(text_tag.split(".")) > 1:
+            tag = text_tag.split(".")[0]
+            tag_class = text_tag.split(".")[1]
+
+        if len(text_tag.split("#")) > 1:
+            tag = text_tag.split("#")[0]
+            tag_id = text_tag.split("#")[1]
+
+        if len(soup.find_all(tag, class_=tag_class, id=tag_id)) > 0:
             valid_text_tags[text_tag] = []
-            for obj_tag in soup.find_all(text_tag):
+            for obj_tag in soup.find_all(tag, class_=tag_class, id=tag_id):
                 valid_text_tags[text_tag].append(obj_tag.text.strip())
             valid_text_tags[text_tag] = separator.join(valid_text_tags[text_tag])
             i = i + 1
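
Body-tag entries can now carry a class or id suffix; a hedged illustration of the parsing above (the selector names are hypothetical, not keys shipped in url_sniffing.yaml):

    text_tag = "div.job-description"
    tag, tag_class, tag_id = text_tag.split(".")[0], text_tag.split(".")[1], None
    # resulting call: soup.find_all("div", class_="job-description", id=None)

    text_tag = "section#content"
    tag, tag_class, tag_id = text_tag.split("#")[0], None, text_tag.split("#")[1]
    # resulting call: soup.find_all("section", class_=None, id="content")
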
@@ -161,5 +182,23 @@ def _complementary_report(final_report, soup, silent):
     return final_report


-def get_tags(url, sniffing_config={}):
-    return sniff_url(url=url, silent=True, sniffing_config=sniffing_config)
+def get_tags(url, sniffing_config={}, driver=None):
+    return sniff_url(
+        url=url, silent=True, sniffing_config=sniffing_config, driver=driver
+    )
+
+
+def get_url(url, driver=None):
+    if driver is None and config.get_sniffing("use-browser"):
+        driver = browser.get_driver()
+
+    if driver is not None:
+        try:
+            driver.get(url)
+            time.sleep(config.get_sniffing("browser-waiting-time"))
+            driver.implicitly_wait(config.get_sniffing("browser-waiting-time"))
+            return driver.page_source
+        except:
+            print("error")
+            pass
+    return requests.get(url=url, timeout=config.get_sniffing("timeout")).text
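
A short sketch of the fetch fallback implemented by `get_url()` above: with the shipped defaults (`use-browser: false`) it is a plain `requests.get` with the configured timeout; with a browser enabled it returns `driver.page_source` after the configured wait, and any Selenium error prints "error" and falls back to `requests`. The call below is hypothetical:

    from ohmyscrapper.modules.sniff_url import get_url, get_tags

    html = get_url("https://example.com")     # requests.get(..., timeout=10).text with the defaults
    report = get_tags("https://example.com")  # silent sniff_url(); includes a "json" dump of the report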