ohmyscrapper 0.8.1__tar.gz → 0.8.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/PKG-INFO +2 -2
  2. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/README.md +1 -1
  3. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/pyproject.toml +1 -1
  4. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/__init__.py +20 -2
  5. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/core/default_files/config.yaml +2 -1
  6. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/scrap_urls.py +4 -1
  7. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/__main__.py +0 -0
  8. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/core/config.py +0 -0
  9. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/core/config_files.py +0 -0
  10. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/core/default_files/url_sniffing.yaml +0 -0
  11. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/core/default_files/url_types.yaml +0 -0
  12. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/models/urls_manager.py +0 -0
  13. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/browser.py +0 -0
  14. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/classify_urls.py +0 -0
  15. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/load_txt.py +0 -0
  16. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/merge_dbs.py +0 -0
  17. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/process_with_ai.py +0 -0
  18. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/seed.py +0 -0
  19. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/show.py +0 -0
  20. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/sniff_url.py +0 -0
  21. {ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/untouch_all.py +0 -0
{ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ohmyscrapper
-Version: 0.8.1
+Version: 0.8.2
 Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
@@ -20,7 +20,7 @@ Project-URL: Changelog, https://github.com/bouli/ohmyscrapper/releases/latest
 Project-URL: Repository, https://github.com/bouli/ohmyscrapper
 Description-Content-Type: text/markdown
 
-# 🐶 OhMyScrapper - v0.8.1
+# 🐶 OhMyScrapper - v0.8.2
 
 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.
{ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/README.md

@@ -1,4 +1,4 @@
-# 🐶 OhMyScrapper - v0.8.1
+# 🐶 OhMyScrapper - v0.8.2
 
 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.
{ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "ohmyscrapper"
-version = "0.8.1"
+version = "0.8.2"
 license = "MIT"
 description = "OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions."
 readme = "README.md"
{ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/__init__.py

@@ -20,7 +20,7 @@ from ohmyscrapper.core.config import update
 
 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.8.1")
+    parser.add_argument("--version", action="version", version="%(prog)s v0.8.2")
 
     update()
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
@@ -104,6 +104,16 @@ def main():
     sniff_url_parser.add_argument(
         "url", default="https://cesarcardoso.cc/", help="Url to sniff"
     )
+    sniff_url_parser.add_argument(
+        "--metatags",
+        default="mt",
+        help="Meta tags you want to watch separated by comma ','",
+    )
+    sniff_url_parser.add_argument(
+        "--bodytags",
+        default="bd",
+        help="Body tags you want to watch separated by comma ','",
+    )
 
     show_urls_parser = subparsers.add_parser("show", help="Show urls and prefixes")
     show_urls_parser.add_argument(
@@ -153,7 +163,15 @@ def main():
         return
 
     if args.command == "sniff-url":
-        sniff_url(args.url)
+        sniffing_config = {}
+        if len(args.metatags) > 0:
+            sniffing_config["metatags"] = str(args.metatags).split(",")
+
+        if len(args.bodytags) > 0:
+            sniffing_config["bodytags"] = str(args.bodytags).split(",")
+
+        sniff_url(args.url, sniffing_config=sniffing_config)
+
         return
 
     if args.command == "scrap-urls":
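The new sniff-url flags are split on commas into a sniffing_config dict before being handed to sniff_url. A self-contained sketch of the same wiring as in the hunks above (the URL and tag names here are placeholder examples, not values from the package):

import argparse

# Mirrors the sniff-url options added in 0.8.2.
parser = argparse.ArgumentParser(prog="ohmyscrapper")
subparsers = parser.add_subparsers(dest="command")
sniff = subparsers.add_parser("sniff-url")
sniff.add_argument("url")
sniff.add_argument("--metatags", default="mt")
sniff.add_argument("--bodytags", default="bd")

args = parser.parse_args(
    ["sniff-url", "https://example.com", "--metatags", "og:title,og:description"]
)

# Same splitting logic as in main(): each comma-separated flag becomes a list.
sniffing_config = {}
if len(args.metatags) > 0:
    sniffing_config["metatags"] = str(args.metatags).split(",")
if len(args.bodytags) > 0:
    sniffing_config["bodytags"] = str(args.bodytags).split(",")

print(sniffing_config)
# {'metatags': ['og:title', 'og:description'], 'bodytags': ['bd']}

Note that because both flags default to non-empty strings ("mt" and "bd"), the len(...) > 0 guards only skip a key when an empty string is passed explicitly, e.g. --metatags ''.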
{ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/core/default_files/config.yaml

@@ -17,5 +17,6 @@ ai:
 
 sniffing:
   timeout: 10
-  use-browser: chrome
+  use-browser: false
   browser-waiting-time: 5
+  round-sleeping: 10
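Two defaults change here: use-browser switches from the string chrome to the boolean false, and a new round-sleeping key is introduced, consumed by the scrap_urls.py hunk below. A minimal sketch of what a plain PyYAML load of the new section yields (the project's own loader in ohmyscrapper.core.config is not shown in this diff):

import yaml  # PyYAML; assumed available for this sketch

raw = """
sniffing:
  timeout: 10
  use-browser: false
  browser-waiting-time: 5
  round-sleeping: 10
"""

sniffing = yaml.safe_load(raw)["sniffing"]

# use-browser now parses as the boolean False rather than the string "chrome".
assert sniffing["use-browser"] is False
# round-sleeping (seconds) feeds the randomized wait between scraping rounds.
assert sniffing["round-sleeping"] == 10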
{ohmyscrapper-0.8.1 → ohmyscrapper-0.8.2}/src/ohmyscrapper/modules/scrap_urls.py

@@ -181,7 +181,10 @@ def scrap_urls(
     print(f"-- 🗃️ {n_urls} scraped urls...")
     classify_urls.classify_urls()
     if recursive:
-        wait = random.randint(5, 10)
+        wait = random.randint(
+            int(config.get_sniffing("round-sleeping") / 2),
+            int(config.get_sniffing("round-sleeping")),
+        )
         print(
             f"🐶 Scrapper is sleeping for {wait} seconds before next round of {limit} urls"
         )
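The hardcoded randint(5, 10) becomes configurable: the wait is drawn between half of round-sleeping and round-sleeping itself. A quick check of the bounds arithmetic, with config.get_sniffing stubbed by the 0.8.2 default:

import random

# Stand-in for config.get_sniffing("round-sleeping"); 10 is the 0.8.2 default.
ROUND_SLEEPING = 10

wait = random.randint(int(ROUND_SLEEPING / 2), int(ROUND_SLEEPING))
# With the default of 10 this is randint(5, 10), identical to the old
# hardcoded range; setting round-sleeping: 30 would give randint(15, 30).
print(f"sleeping {wait}s before the next round")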