PyPI - ohmyscrapper - Versions diffs - 0.8.2__py3-none-any.whl → 0.8.4__py3-none-any.whl - Mend

ohmyscrapper 0.8.2__py3-none-any.whl → 0.8.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

ohmyscrapper/__init__.py +13 -8
ohmyscrapper/core/config.py +1 -0
ohmyscrapper/core/config_files.py +1 -0
ohmyscrapper/core/default_files/config.yaml +6 -5
ohmyscrapper/models/urls_manager.py +4 -2
ohmyscrapper/modules/browser.py +1 -0
ohmyscrapper/modules/cache.py +100 -0
ohmyscrapper/modules/classify_urls.py +4 -2
ohmyscrapper/modules/load_txt.py +2 -0
ohmyscrapper/modules/process_with_ai.py +10 -11
ohmyscrapper/modules/scrap_urls.py +6 -6
ohmyscrapper/modules/show.py +4 -2
ohmyscrapper/modules/sniff_url.py +24 -8
{ohmyscrapper-0.8.2.dist-info → ohmyscrapper-0.8.4.dist-info}/METADATA +3 -3
ohmyscrapper-0.8.4.dist-info/RECORD +23 -0
{ohmyscrapper-0.8.2.dist-info → ohmyscrapper-0.8.4.dist-info}/WHEEL +2 -2
ohmyscrapper-0.8.2.dist-info/RECORD +0 -22
{ohmyscrapper-0.8.2.dist-info → ohmyscrapper-0.8.4.dist-info}/entry_points.txt +0 -0

ohmyscrapper/__init__.py CHANGED Viewed

@@ -1,26 +1,27 @@
 import argparse
+from ohmyscrapper.core.config import update
+from ohmyscrapper.modules import cache
 from ohmyscrapper.modules.classify_urls import classify_urls
-from ohmyscrapper.modules.sniff_url import sniff_url
 from ohmyscrapper.modules.load_txt import load_txt
-from ohmyscrapper.modules.seed import seed, export_url_types_to_file
+from ohmyscrapper.modules.merge_dbs import merge_dbs
+from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
 from ohmyscrapper.modules.scrap_urls import scrap_urls
+from ohmyscrapper.modules.seed import export_url_types_to_file, seed
 from ohmyscrapper.modules.show import (
+    export_report,
+    export_urls,
     show_url,
     show_urls,
     show_urls_valid_prefix,
-    export_urls,
-    export_report,
 )
+from ohmyscrapper.modules.sniff_url import sniff_url
 from ohmyscrapper.modules.untouch_all import untouch_all
-from ohmyscrapper.modules.process_with_ai import process_with_ai, reprocess_ai_history
-from ohmyscrapper.modules.merge_dbs import merge_dbs
-from ohmyscrapper.core.config import update
 def main():
     parser = argparse.ArgumentParser(prog="ohmyscrapper")
-    parser.add_argument("--version", action="version", version="%(prog)s v0.8.2")
+    parser.add_argument("--version", action="version", version="%(prog)s v0.8.4")
     update()
     subparsers = parser.add_subparsers(dest="command", help="Available commands")
@@ -141,6 +142,7 @@ def main():
     )
     merge_parser = subparsers.add_parser("merge_dbs", help="Merge databases.")
+    clean_cache_parser = subparsers.add_parser("cleancache", help="Clean cache.")
     args = parser.parse_args()
     if args.command == "classify-urls":
@@ -236,6 +238,9 @@ def main():
         export_report()
         return
+    if args.command == "cleancache":
+        cache.clean()
 if __name__ == "__main__":
     main()

ohmyscrapper/core/config.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 from ohmyscrapper.core import config_files
 default_app_dir = "ohmyscrapper"

ohmyscrapper/core/config_files.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 import yaml

ohmyscrapper/core/default_files/config.yaml CHANGED Viewed

@@ -2,11 +2,12 @@ db:
   db_file: local.db
 default_dirs:
-  db: ./db
-  input: ./input
-  output: ./output
-  prompts: ./prompts
-  templates: ./templates
+  db: ./ohmyscrapper_db
+  input: ./ohmyscrapper_input
+  output: ./ohmyscrapper_output
+  prompts: ./ohmyscrapper_prompts
+  templates: ./ohmyscrapper_templates
+  cache: ./ohmyscrapper_cache
 default_files:
   url_types: url_types.yaml

ohmyscrapper/models/urls_manager.py CHANGED Viewed

@@ -1,9 +1,11 @@
+import glob
 import os
 import sqlite3
 import time
-import glob
-import pandas as pd
 from urllib.parse import urlparse, urlunparse
+import pandas as pd
 from ohmyscrapper.core import config

ohmyscrapper/modules/browser.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from selenium import webdriver
 from ohmyscrapper.core.config import get_sniffing

ohmyscrapper/modules/cache.py ADDED Viewed

@@ -0,0 +1,100 @@
+import os
+from ohmyscrapper.core import config
+def safe_cache_id(func):
+    def _filter_cache_id(*args, **kwargs):
+        if "cache_id" in args:
+            args["cache_id"] = filter_cache_id(args["cache_id"])
+        if "cache_id" in kwargs:
+            kwargs["cache_id"] = filter_cache_id(kwargs["cache_id"])
+        return func(*args, **kwargs)
+    return _filter_cache_id
+def filter_cache_id(cache_id):
+    cache_id = cache_id.replace('"', "").replace("\\", "")
+    cache_id = f'"{cache_id}"'
+    return cache_id
+cache_files_extension = "html"
+@safe_cache_id
+def set(text: str, cache_id: str):
+    cache_folder = config.get_dir("cache")
+    cache_index_file_path = get_cache_index_path()
+    cache_folder_files = os.listdir(cache_folder)
+    cached_file_index = get_index_from_file_index(_safe_cache_id=cache_id)
+    if cached_file_index is not None:
+        new_file_index = cached_file_index
+    else:
+        new_file_index = len(cache_folder_files)
+        with open(cache_index_file_path, "a") as cache_index_file_writer:
+            cache_index_file_writer.write(f"\n{new_file_index}: {cache_id}")
+    new_file_name = f"{new_file_index}.{cache_files_extension}"
+    new_file_path = os.path.join(cache_folder, new_file_name)
+    with open(new_file_path, "w+") as new_file_writer:
+        new_file_writer.write(text)
+@safe_cache_id
+def get(cache_id: str) -> str:
+    cached_file_index = get_index_from_file_index(_safe_cache_id=cache_id)
+    code = get_cached_file_by_index(cached_file_index=cached_file_index)
+    return code
+def get_index_from_file_index(_safe_cache_id):
+    cache_index_file = get_cache_index_file()
+    if cache_index_file.find(_safe_cache_id) < 1:
+        return None
+    cache_index_file = cache_index_file[: cache_index_file.find(_safe_cache_id) - 2]
+    cached_file_index = int(cache_index_file.split("\n")[-1].strip())
+    return cached_file_index
+def get_cache_index_path() -> str:
+    cache_index_file_name = "cache_index.yaml"
+    cache_folder = config.get_dir("cache")
+    cache_index_file_path = os.path.join(cache_folder, cache_index_file_name)
+    if not os.path.exists(cache_index_file_path):
+        with open(cache_index_file_path, "w+") as cache_index_file_writer:
+            cache_index_file_writer.write(f"0: {cache_index_file_name}")
+    return cache_index_file_path
+def get_cache_index_file() -> str:
+    with open(get_cache_index_path(), "r") as f:
+        cache_index_file_content = f.read()
+    return cache_index_file_content
+def get_cached_file_by_index(cached_file_index: int) -> str:
+    code = None
+    cache_folder = config.get_dir("cache")
+    cached_file_name = f"{cached_file_index}.{cache_files_extension}"
+    cached_file_path = os.path.join(cache_folder, cached_file_name)
+    if not os.path.exists(cached_file_path):
+        return None
+    with open(cached_file_path, "r") as cached_file_reader:
+        code = cached_file_reader.read()
+    return code
+def clean():
+    cache_folder = config.get_dir("cache")
+    cache_folder_files = os.listdir(cache_folder)
+    for file in cache_folder_files:
+        file_to_clean = os.path.join(cache_folder, file)
+        if os.path.exists(file_to_clean):
+            os.remove(file_to_clean)

ohmyscrapper/modules/classify_urls.py CHANGED Viewed

@@ -1,7 +1,9 @@
+import time
+import pandas as pd
 import ohmyscrapper.models.urls_manager as urls_manager
 from ohmyscrapper.modules import seed
-import pandas as pd
-import time
 def classify_urls(recursive=False):

ohmyscrapper/modules/load_txt.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import os
 from urlextract import URLExtract
 import ohmyscrapper.models.urls_manager as urls_manager
 from ohmyscrapper.core import config

ohmyscrapper/modules/process_with_ai.py CHANGED Viewed

@@ -1,13 +1,15 @@
-import ohmyscrapper.models.urls_manager as urls_manager
-from ohmyscrapper.core import config
-from bs4 import BeautifulSoup
-from google import genai
-from dotenv import load_dotenv
+import json
+import os
 import random
 import time
-import os
 import yaml
-import json
+from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+from google import genai
+import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config
 # TODO: !!! REFACTOR !!!
 load_dotenv()
@@ -85,15 +87,12 @@ def process_with_ai(recursive=True, triggered_times=0):
     texts = ""
     for index, row in df.iterrows():
-        texts = (
-            texts
-            + f"""
+        texts = texts + f"""
         <text>
         <id>{str(row['id'])}</id>
         {row['description']}
         </text>
         """
-        )
     if texts == "":
         print("no urls to process")
         return

ohmyscrapper/modules/scrap_urls.py CHANGED Viewed

@@ -1,13 +1,13 @@
+import random
+import time
 import ohmyscrapper.models.urls_manager as urls_manager
-import ohmyscrapper.modules.sniff_url as sniff_url
-import ohmyscrapper.modules.load_txt as load_txt
-import ohmyscrapper.modules.classify_urls as classify_urls
 import ohmyscrapper.modules.browser as browser
+import ohmyscrapper.modules.classify_urls as classify_urls
+import ohmyscrapper.modules.load_txt as load_txt
+import ohmyscrapper.modules.sniff_url as sniff_url
 from ohmyscrapper.core import config
-import time
-import random
 def scrap_url(url, verbose=False, driver=None):
     if url["url_type"] is None:

ohmyscrapper/modules/show.py CHANGED Viewed

@@ -1,10 +1,12 @@
-import ohmyscrapper.models.urls_manager as urls_manager
-from ohmyscrapper.core import config
 import math
 import os
 from rich.console import Console
 from rich.table import Table
+import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config
 def export_urls(limit=0, csv_file="output/urls.csv", simplify=False):
     output_folder = config.get_dir("output")

ohmyscrapper/modules/sniff_url.py CHANGED Viewed

@@ -1,9 +1,13 @@
+import json
+import os
+import time
 import requests
 from bs4 import BeautifulSoup
-import json
-from ohmyscrapper.core import config
 import ohmyscrapper.modules.browser as browser
-import time
+from ohmyscrapper.core import config
+from ohmyscrapper.modules import cache
 def sniff_url(
@@ -89,9 +93,10 @@ def _extract_a_tags(soup, silent, url=None):
         i = i + 1
         href = a_tag.get("href")
-        if url is not None and href[:1] == "/":
-            domain = url.split("//")[0] + "//" + url.split("//")[1].split("/")[0]
-            href = domain + href
+        if href is not None:
+            if url is not None and href[:1] == "/":
+                domain = url.split("//")[0] + "//" + url.split("//")[1].split("/")[0]
+                href = domain + href
         a_links.append({"text": a_tag.text, "href": href})
         if not silent:
@@ -189,6 +194,13 @@ def get_tags(url, sniffing_config={}, driver=None):
 def get_url(url, driver=None):
+    cache_prefix = "sniff-urf:"
+    cached_code = cache.get(cache_id=cache_prefix + url)
+    if cached_code is not None:
+        print("You used the cache for this URL.")
+        return cached_code
     if driver is None and config.get_sniffing("use-browser"):
         driver = browser.get_driver()
@@ -197,8 +209,12 @@ def get_url(url, driver=None):
             driver.get(url)
             time.sleep(config.get_sniffing("browser-waiting-time"))
             driver.implicitly_wait(config.get_sniffing("browser-waiting-time"))
-            return driver.page_source
+            code = driver.page_source
+            cache.set(text=code, cache_id=cache_prefix + url)
+            return code
         except:
             print("error")
             pass
-    return requests.get(url=url, timeout=config.get_sniffing("timeout")).text
+    code = requests.get(url=url, timeout=config.get_sniffing("timeout")).text
+    cache.set(text=code, cache_id=cache_prefix + url)
+    return code

{ohmyscrapper-0.8.2.dist-info → ohmyscrapper-0.8.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ohmyscrapper
-Version: 0.8.2
+Version: 0.8.4
 Summary: OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a final report with general information about job positions.
 Author: Cesar Cardoso
 Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
@@ -16,11 +16,11 @@ Requires-Dist: rich>=14.2.0
 Requires-Dist: selenium>=4.39.0
 Requires-Dist: urlextract>=1.9.0
 Requires-Python: >=3.11
-Project-URL: Changelog, https://github.com/bouli/ohmyscrapper/releases/latest
 Project-URL: Repository, https://github.com/bouli/ohmyscrapper
+Project-URL: Changelog, https://github.com/bouli/ohmyscrapper/releases/latest
 Description-Content-Type: text/markdown
-# 🐶 OhMyScrapper - v0.8.2
+# 🐶 OhMyScrapper - v0.8.4
 OhMyScrapper scrapes texts and urls looking for links and jobs-data to create a
 final report with general information about job positions.

ohmyscrapper-0.8.4.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,23 @@
+ohmyscrapper/__init__.py,sha256=C_nhLCKrLogCLQVVHlTJNMEOSFiLDTR0bBCtW8T8kXE,7859
+ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
+ohmyscrapper/core/config.py,sha256=2S8iYMdN5-fCX4EW1cmSd4-XabzqxIgbupBuymV3yjY,3185
+ohmyscrapper/core/config_files.py,sha256=5FyPFpN7WQrlgQWr85s5NF-UbnzbyFsWEVVMOs8iyaw,3411
+ohmyscrapper/core/default_files/config.yaml,sha256=ETuTyFM1fedjehM9cZRoKxRKcYDH0LlPFAZ3vLj0uxU,436
+ohmyscrapper/core/default_files/url_sniffing.yaml,sha256=HUwmGUwuJy7t97bJHgNiZOl1thvD9bLaelPgbEr5bMY,465
+ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
+ohmyscrapper/models/urls_manager.py,sha256=XC8HODdsCEo_nn1j7nH_jy9AUTb4PpmkGlaFWV048TM,12117
+ohmyscrapper/modules/browser.py,sha256=pH41NVqYgay_zEIZfncJbtwz_13REX5HVH8uk581sM4,857
+ohmyscrapper/modules/cache.py,sha256=3EQnv9VYJWrE5fdLwkGEUOAHV16nprhyid6MlBpa9Gg,3228
+ohmyscrapper/modules/classify_urls.py,sha256=oK_UhQPF976cexlarqi14pSw8tWLGYfaIMCXzbAhnpI,1040
+ohmyscrapper/modules/load_txt.py,sha256=Gpob1W_LLfkBnNbtqxgCRNGeyufmHECreDqTlj9O_Mk,4140
+ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
+ohmyscrapper/modules/process_with_ai.py,sha256=TqebqC3_rCx6cbvq3oQhaXLZxGUYpKvhyH3I3zjsA94,7221
+ohmyscrapper/modules/scrap_urls.py,sha256=affq5Vx5BKrl7uL2mpcThDBOXznq0d5fz1if5xAttOA,6627
+ohmyscrapper/modules/seed.py,sha256=hHEGSoPXsmclTaRPeIcK2oC1Xpg3_JqBv_YFMD0m5Jw,1044
+ohmyscrapper/modules/show.py,sha256=i5l8_Zooj6vg1JLqWtvGPWHv7wL53aHZ43-SKS1sF9Y,3879
+ohmyscrapper/modules/sniff_url.py,sha256=NpIMJxNEUzmDkFGVqDJXgVtTWEGKRE_dSiJHNz-vXoE,7027
+ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
+ohmyscrapper-0.8.4.dist-info/WHEEL,sha256=fAguSjoiATBe7TNBkJwOjyL1Tt4wwiaQGtNtjRPNMQA,80
+ohmyscrapper-0.8.4.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
+ohmyscrapper-0.8.4.dist-info/METADATA,sha256=h2Agb2KCKiBkX-HEj_8f9EuV3NOq6AJ9h1WrnPQ79iU,4293
+ohmyscrapper-0.8.4.dist-info/RECORD,,

{ohmyscrapper-0.8.2.dist-info → ohmyscrapper-0.8.4.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: uv 0.9.17
+Generator: uv 0.9.28
 Root-Is-Purelib: true
-Tag: py3-none-any
+Tag: py3-none-any

ohmyscrapper-0.8.2.dist-info/RECORD DELETED Viewed

@@ -1,22 +0,0 @@
-ohmyscrapper/__init__.py,sha256=WzXXhhlVkyAPbqeo7NgezLb6TbEJcuFf8JYAdcf3zBE,7678
-ohmyscrapper/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
-ohmyscrapper/core/config.py,sha256=bfmoTr1j4SnIhKFZC_F9fh4Y90VqSPFf_g6Rm-aNui4,3184
-ohmyscrapper/core/config_files.py,sha256=3mIXVxurmyXCpKueyyGsZ6lUnV8VJ2gnLU2QaqhWhhI,3410
-ohmyscrapper/core/default_files/config.yaml,sha256=y54QAjOnogpl8LEzhmn89tAfRzle4ZWWtIbYRjxX8Rk,341
-ohmyscrapper/core/default_files/url_sniffing.yaml,sha256=HUwmGUwuJy7t97bJHgNiZOl1thvD9bLaelPgbEr5bMY,465
-ohmyscrapper/core/default_files/url_types.yaml,sha256=20kvv8_iWRT-pLa014RXYpAmPSonn6tDnG302rx7l-o,228
-ohmyscrapper/models/urls_manager.py,sha256=sP2T4k1HOj8ccaVGWbuhfw1BDfOUSVL4_WR9vRyHjOA,12115
-ohmyscrapper/modules/browser.py,sha256=6AaNFQ7jV91DvHqbsBT6It_-tNbVN2qJC_c1vXTweJY,856
-ohmyscrapper/modules/classify_urls.py,sha256=GhiosAQUITy1DQe_PksYV9QRKVTgpkSE28dkutzbWVA,1038
-ohmyscrapper/modules/load_txt.py,sha256=pkWBIdh6vORPfENDZ6wGM89vswnOnc1flqKfkLs9RD8,4138
-ohmyscrapper/modules/merge_dbs.py,sha256=0pK3PPUGSbnaDkdpQUGCHemOVaKO37bfHwnsy_EVpWQ,115
-ohmyscrapper/modules/process_with_ai.py,sha256=kl39Jzl-PUwh6AfmTZ9SLFUYs9Sk4biqgt8rNz3X1FA,7255
-ohmyscrapper/modules/scrap_urls.py,sha256=_e4jT7eBWGP6cqI6RaD0xzNX1vCFBx96JIBPGW3mAV4,6627
-ohmyscrapper/modules/seed.py,sha256=hHEGSoPXsmclTaRPeIcK2oC1Xpg3_JqBv_YFMD0m5Jw,1044
-ohmyscrapper/modules/show.py,sha256=jsAs4g8ouA9wymkBfkDCbpVWKD-m_20uKG-m1cZAUGA,3877
-ohmyscrapper/modules/sniff_url.py,sha256=BZphbr2V7MDyFPW7APlh7_CLTtc_u3kcB7DY2QjVVQo,6579
-ohmyscrapper/modules/untouch_all.py,sha256=DAwWYfqMFifHPtFCxSamu0AxHCgk6aJbTnBy6wLucXM,167
-ohmyscrapper-0.8.2.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
-ohmyscrapper-0.8.2.dist-info/entry_points.txt,sha256=BZud6D16XkfjelDa4Z33mji-KJbbZXgq2FoLrzjru5I,52
-ohmyscrapper-0.8.2.dist-info/METADATA,sha256=2OnXXefFcRT_ChEVdL6LKZoe6iNKyFqjoN10LBUBB34,4293
-ohmyscrapper-0.8.2.dist-info/RECORD,,

{ohmyscrapper-0.8.2.dist-info → ohmyscrapper-0.8.4.dist-info}/entry_points.txt RENAMED Viewed

File without changes

ohmyscrapper 0.8.2__py3-none-any.whl → 0.8.4__py3-none-any.whl

ohmyscrapper 0.8.2__py3-none-any.whl → 0.8.4__py3-none-any.whl