ohmyscrapper 0.2.3__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,31 +4,45 @@ import time
 import glob
 import pandas as pd
 from urllib.parse import urlparse, urlunparse
+from ohmyscrapper.core import config
 
 
 def get_db_dir():
-    if not os.path.exists("db"):
-        os.mkdir("db")
-    return "db"
+    db_folder = config.get_dir("db")
+    if not os.path.exists(db_folder):
+        os.mkdir(db_folder)
+    return db_folder
 
 
 def get_db_path():
-    return get_db_dir() + "/local.db"
+    db_file = config.get_db()
+    return os.path.join(get_db_dir(), db_file)
 
 
 def get_db_connection():
+    if not os.path.exists(get_db_path()):
+        create_tables(sqlite3.connect(get_db_path()))
     return sqlite3.connect(get_db_path())
 
 
-# TODO: check if it makes sense
-conn = get_db_connection()
+def use_connection(func):
+    def provide_connection(*args, **kwargs):
+        global conn
+        with get_db_connection() as conn:
+            try:
+                return func(*args, **kwargs)
+            except:
+                update_db()
+                return func(*args, **kwargs)
 
+    return provide_connection
 
-def create_tables():
+
+def create_tables(conn):
 
     c = conn.cursor()
     c.execute(
-        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, h1 TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
+        "CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url_type STRING, parent_url TEXT, url TEXT UNIQUE, url_destiny TEXT, title TEXT, error TEXT, description TEXT, description_links INTEGER DEFAULT 0, json TEXT, json_ai TEXT, ai_processed INTEGER DEFAULT 0, history INTEGER DEFAULT 0, last_touch DATETIME, created_at DATETIME)"
     )
     c.execute(
         "CREATE TABLE IF NOT EXISTS ai_log (id INTEGER PRIMARY KEY, instructions STRING, response STRING, model STRING, prompt_file STRING, prompt_name STRING, created_at DATETIME)"
@@ -38,27 +52,25 @@ def create_tables():
         "CREATE TABLE IF NOT EXISTS urls_valid_prefix (id INTEGER PRIMARY KEY, url_prefix TEXT UNIQUE, url_type TEXT)"
     )
 
-    return pd.read_sql_query("SELECT * FROM urls LIMIT 100", conn)
 
+def update_db():
+    try:
+        c = conn.cursor()
+        c.execute("ALTER TABLE urls RENAME COLUMN h1 TO title")
+    except:
+        pass
 
-# TODO: not sure this should be something. depends on the project
-def seeds():
-    create_tables()
 
-    add_urls_valid_prefix("https://%.linkedin.com/posts/%", "linkedin_post")
-    add_urls_valid_prefix("https://lnkd.in/%", "linkedin_redirect")
-    add_urls_valid_prefix("https://%.linkedin.com/jobs/view/%", "linkedin_job")
-    add_urls_valid_prefix("https://%.linkedin.com/feed/%", "linkedin_feed")
-    add_urls_valid_prefix("https://%.linkedin.com/company/%", "linkedin_company")
+def seeds(seeds={}):
 
-    # add_urls_valid_prefix("%.pdf", "pdf")
-    # add_url('https://imazon.org.br/categorias/artigos-cientificos/')
+    for url_type, url_prefix in seeds.items():
+        add_urls_valid_prefix(url_prefix, url_type)
 
     return True
 
 
+@use_connection
 def add_urls_valid_prefix(url_prefix, url_type):
-    conn = get_db_connection()
 
     df = pd.read_sql_query(
         f"SELECT * FROM urls_valid_prefix WHERE url_prefix = '{url_prefix}'", conn
@@ -72,6 +84,7 @@ def add_urls_valid_prefix(url_prefix, url_type):
     conn.commit()
 
 
+@use_connection
 def get_urls_valid_prefix_by_type(url_type):
     df = pd.read_sql_query(
         f"SELECT * FROM urls_valid_prefix WHERE url_type = '{url_type}'", conn
@@ -79,12 +92,14 @@ def get_urls_valid_prefix_by_type(url_type):
     return df
 
 
+@use_connection
 def get_urls_valid_prefix_by_id(id):
     df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix WHERE id = '{id}'", conn)
     return df
 
 
 # TODO: pagination required
+@use_connection
 def get_urls_valid_prefix(limit=0):
     if limit > 0:
         df = pd.read_sql_query(f"SELECT * FROM urls_valid_prefix LIMIT {limit}", conn)
@@ -94,6 +109,7 @@ def get_urls_valid_prefix(limit=0):
 
 
 # TODO: pagination required
+@use_connection
 def get_urls(limit=0):
     if limit > 0:
         df = pd.read_sql_query(
@@ -104,6 +120,7 @@ def get_urls(limit=0):
     return df
 
 
+@use_connection
 def get_urls_report():
     sql = """
     WITH parent_url AS (
@@ -113,7 +130,7 @@ def get_urls_report():
         SELECT
             u.id,
             u.url,
-            u.h1
+            u.title
         FROM urls u
         INNER JOIN parent_url p
             ON u.url = p.parent_url
@@ -122,9 +139,9 @@ def get_urls_report():
         u.id,
         u.url_type,
         u.url,
-        COALESCE(u.h1, p.h1) as h1,
+        COALESCE(u.title, p.title) as title,
         p.url as parent_url,
-        p.h1 as parent_h1
+        p.title as parent_title
     FROM urls u
     LEFT JOIN parents p
         ON u.parent_url = p.url
@@ -138,6 +155,7 @@ def get_urls_report():
     return df
 
 
+@use_connection
 def get_url_by_url(url):
     url = clean_url(url)
     df = pd.read_sql_query(f"SELECT * FROM urls WHERE url = '{url}'", conn)
@@ -145,12 +163,14 @@ def get_url_by_url(url):
     return df
 
 
+@use_connection
 def get_url_by_id(id):
     df = pd.read_sql_query(f"SELECT * FROM urls WHERE id = '{id}'", conn)
 
     return df
 
 
+@use_connection
 def get_urls_by_url_type(url_type):
     df = pd.read_sql_query(
         f"SELECT * FROM urls WHERE history = 0 AND url_type = '{url_type}'", conn
@@ -158,6 +178,7 @@ def get_urls_by_url_type(url_type):
     return df
 
 
+@use_connection
 def get_urls_by_url_type_for_ai_process(url_type="linkedin_post", limit=10):
     df = pd.read_sql_query(
         f"SELECT * FROM urls WHERE history = 0 AND url_type = '{url_type}' AND ai_processed = 0 LIMIT {limit}",
@@ -166,6 +187,7 @@ def get_urls_by_url_type_for_ai_process(url_type="linkedin_post", limit=10):
     return df
 
 
+@use_connection
 def get_url_like_unclassified(like_condition):
     df = pd.read_sql_query(
         f"SELECT * FROM urls WHERE history = 0 AND url LIKE '{like_condition}' AND url_type IS NULL",
@@ -174,12 +196,13 @@ def get_url_like_unclassified(like_condition):
     return df
 
 
-def add_url(url, h1=None, parent_url=None):
+@use_connection
+def add_url(url, title=None, parent_url=None):
     url = clean_url(url)
     c = conn.cursor()
 
-    if h1 is not None:
-        h1 = h1.strip()
+    if title is not None:
+        title = title.strip()
 
     if parent_url is None:
         parent_url = None
@@ -188,14 +211,15 @@ def add_url(url, h1=None, parent_url=None):
 
     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url, h1, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
-            (url, h1, parent_url, int(time.time())),
+            "INSERT INTO urls (url, title, parent_url, created_at, ai_processed, description_links, history) VALUES (?, ?, ?, ?, 0, 0, 0)",
+            (url, title, parent_url, int(time.time())),
         )
         conn.commit()
 
     return get_url_by_url(url)
 
 
+@use_connection
 def add_ai_log(instructions, response, model, prompt_file, prompt_name):
     c = conn.cursor()
 
@@ -205,10 +229,14 @@ def add_ai_log(instructions, response, model, prompt_file, prompt_name):
     )
     conn.commit()
 
+
+@use_connection
 def get_ai_log():
     df = pd.read_sql_query(f"SELECT * FROM ai_log", conn)
     return df
 
+
+@use_connection
 def set_url_destiny(url, destiny):
     url = clean_url(url)
     destiny = clean_url(destiny)
@@ -222,45 +250,62 @@ def set_url_destiny(url, destiny):
     conn.commit()
 
 
-def set_url_h1(url, value):
+@use_connection
+def set_url_title(url, value):
     value = str(value).strip()
     url = clean_url(url)
     c = conn.cursor()
-    c.execute("UPDATE urls SET h1 = ? WHERE url = ?", (value, url))
+    c.execute("UPDATE urls SET title = ? WHERE url = ?", (value, url))
     conn.commit()
 
 
-def set_url_h1_by_id(id, value):
+@use_connection
+def set_url_title_by_id(id, value):
     value = str(value).strip()
 
     c = conn.cursor()
-    c.execute("UPDATE urls SET h1 = ? WHERE id = ?", (value, id))
+    c.execute("UPDATE urls SET title = ? WHERE id = ?", (value, id))
     conn.commit()
 
 
+@use_connection
 def set_url_ai_processed_by_id(id, json_str):
     value = 1
     value = str(value).strip()
     c = conn.cursor()
-    c.execute("UPDATE urls SET ai_processed = ? , json_ai = ? WHERE id = ?", (value, json_str, id))
+    c.execute(
+        "UPDATE urls SET ai_processed = ? , json_ai = ? WHERE id = ?",
+        (value, json_str, id),
+    )
     conn.commit()
 
+
+@use_connection
 def set_url_empty_ai_processed_by_id(id, json_str="empty result"):
     value = 1
     value = str(value).strip()
     c = conn.cursor()
-    c.execute("UPDATE urls SET ai_processed = ? , json_ai = ? WHERE ai_processed = 0 AND id = ?", (value, json_str, id))
+    c.execute(
+        "UPDATE urls SET ai_processed = ? , json_ai = ? WHERE ai_processed = 0 AND id = ?",
+        (value, json_str, id),
+    )
     conn.commit()
 
+
+@use_connection
 def set_url_ai_processed_by_url(url, json_str):
     value = 1
     value = str(value).strip()
     url = clean_url(url)
     c = conn.cursor()
-    c.execute("UPDATE urls SET ai_processed = ?, json_ai = ? WHERE url = ?", (value, json_str, url))
+    c.execute(
+        "UPDATE urls SET ai_processed = ?, json_ai = ? WHERE url = ?",
+        (value, json_str, url),
+    )
     conn.commit()
 
 
+@use_connection
 def set_url_description(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -268,6 +313,7 @@ def set_url_description(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_description_links(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -275,6 +321,7 @@ def set_url_description_links(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_json(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -282,6 +329,7 @@ def set_url_json(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_error(url, value):
     url = clean_url(url)
     c = conn.cursor()
@@ -289,6 +337,7 @@ def set_url_error(url, value):
     conn.commit()
 
 
+@use_connection
 def set_url_type_by_id(url_id, url_type):
     c = conn.cursor()
     c.execute(f"UPDATE urls SET url_type = '{url_type}' WHERE id = {url_id}")
@@ -312,6 +361,7 @@ def clean_url(url):
     return url
 
 
+@use_connection
 def get_untouched_urls(
     limit=10, randomize=True, ignore_valid_prefix=False, only_parents=True
 ):
@@ -331,6 +381,7 @@ def get_untouched_urls(
     return df
 
 
+@use_connection
 def touch_url(url):
     url = clean_url(url)
     c = conn.cursor()
@@ -338,6 +389,7 @@ def touch_url(url):
     conn.commit()
 
 
+@use_connection
 def untouch_url(url):
     url = clean_url(url)
     c = conn.cursor()
@@ -345,12 +397,14 @@ def untouch_url(url):
     conn.commit()
 
 
+@use_connection
 def untouch_all_urls():
     c = conn.cursor()
     c.execute("UPDATE urls SET last_touch = NULL WHERE history = 0")
     conn.commit()
 
 
+@use_connection
 def set_all_urls_as_history():
     c = conn.cursor()
     c.execute("UPDATE urls SET history = 1")
@@ -382,19 +436,19 @@ def merge_dbs() -> None:
                 row["description"],
                 row["json"],
             )
-        # ßmerge_url(df)
 
 
-def merge_url(url, h1, last_touch, created_at, description, json):
+@use_connection
+def merge_url(url, title, last_touch, created_at, description, json):
     url = clean_url(url)
     c = conn.cursor()
 
-    if h1 is not None:
-        h1 = h1.strip()
+    if title is not None:
+        title = title.strip()
 
     if len(get_url_by_url(url)) == 0:
         c.execute(
-            "INSERT INTO urls (url, h1, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
-            (url, h1, last_touch, created_at, description, json),
+            "INSERT INTO urls (url, title, last_touch , created_at, history, ai_processed, description_links, description, json) VALUES (?, ?, ?, ?, 1, 0, 0, ? , ?)",
+            (url, title, last_touch, created_at, description, json),
         )
         conn.commit()
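
The biggest structural change in this file is the new `use_connection` decorator: instead of a single module-level `conn = get_db_connection()` created at import time, every decorated helper now opens its own connection inside a `with` block (and retries once after `update_db()` if the first attempt raises). A minimal, self-contained sketch of that pattern, using an in-memory database and stand-in names rather than the package's actual module:

import sqlite3

DB_PATH = ":memory:"  # stand-in for the package's get_db_path()


def get_db_connection():
    return sqlite3.connect(DB_PATH)


def use_connection(func):
    # Provide a connection to the wrapped function through the module-global
    # `conn`, mirroring the decorator introduced in 0.7.0 (error retry omitted).
    def provide_connection(*args, **kwargs):
        global conn
        with get_db_connection() as conn:
            return func(*args, **kwargs)

    return provide_connection


@use_connection
def count_urls():
    # Decorated helpers can use `conn` without opening or closing it themselves.
    c = conn.cursor()
    c.execute("CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY, url TEXT)")
    c.execute("SELECT COUNT(*) FROM urls")
    return c.fetchone()[0]


print(count_urls())  # 0 on a fresh database

Note that sqlite3's connection context manager commits or rolls back the enclosing transaction but does not close the connection, so each decorated call effectively gets its own connection and transaction scope.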
@@ -1,23 +1,31 @@
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.modules import seed
 import pandas as pd
 import time
 
 
 def classify_urls(recursive=False):
-    urls_manager.seeds()
     df = urls_manager.get_urls_valid_prefix()
+    if len(df) == 0:
+        seed.seed()
+        classify_urls(recursive=recursive)
+        return
 
     keep_alive = True
     while keep_alive:
-        print("waking up!")
+        print("#️⃣ URL Classifier woke up to classify urls!")
        for index, row_prefix in df.iterrows():
-            df_urls = urls_manager.get_url_like_unclassified(like_condition=row_prefix["url_prefix"])
+            df_urls = urls_manager.get_url_like_unclassified(
+                like_condition=row_prefix["url_prefix"]
+            )
            for index, row_urls in df_urls.iterrows():
-                urls_manager.set_url_type_by_id(url_id =row_urls["id"], url_type=row_prefix["url_type"])
+                urls_manager.set_url_type_by_id(
+                    url_id=row_urls["id"], url_type=row_prefix["url_type"]
+                )
 
        if not recursive:
-            print("ending...")
+            print("#️⃣ URL Classifier said: I'm done! See you soon...")
            keep_alive = False
        else:
-            print("sleeping...")
+            print("#️⃣ URL Classifier is taking a nap...")
            time.sleep(10)
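
For context on what the classifier loop above is doing: each row of `urls_valid_prefix` pairs a SQL `LIKE` pattern with a `url_type`, `get_url_like_unclassified` fetches the still-unclassified URLs matching that pattern, and `set_url_type_by_id` stamps them. A rough, self-contained illustration of that matching step against an in-memory table, using one of the prefix patterns from the old `seeds()` defaults (standalone code, not the package's own helpers):

import sqlite3

conn = sqlite3.connect(":memory:")
c = conn.cursor()
c.execute("CREATE TABLE urls (id INTEGER PRIMARY KEY, url TEXT, url_type TEXT)")
c.executemany(
    "INSERT INTO urls (url) VALUES (?)",
    [
        ("https://www.linkedin.com/posts/example-activity-123",),
        ("https://example.org/blog/post",),
    ],
)

# One prefix/type pair, as stored in urls_valid_prefix.
url_prefix, url_type = "https://%.linkedin.com/posts/%", "linkedin_post"

# Find unclassified URLs that match the LIKE pattern...
c.execute(
    "SELECT id, url FROM urls WHERE url_type IS NULL AND url LIKE ?", (url_prefix,)
)
matches = c.fetchall()

# ...and stamp each one with the prefix's url_type.
for row_id, _ in matches:
    c.execute("UPDATE urls SET url_type = ? WHERE id = ?", (url_type, row_id))
conn.commit()

print(matches)  # only the linkedin.com/posts/ URL is classified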
@@ -1,31 +1,99 @@
 import os
 from urlextract import URLExtract
 import ohmyscrapper.models.urls_manager as urls_manager
+from ohmyscrapper.core import config
 
 
-def load_txt(file_name="input/_chat.txt"):
+def _increment_file_name(text_file_content, file_name):
+    print(f"reading and loading file `{file_name}`... ")
+    with open(file_name, "r") as f:
+        return text_file_content + f.read()
 
-    if not os.path.exists("input"):
-        os.mkdir("input")
 
-    urls_manager.create_tables()
+def load_txt(file_name="input", verbose=False):
+    input_folder = config.get_dir("input")
+    if not os.path.exists(input_folder):
+        os.mkdir(input_folder)
+
     urls_manager.seeds()
-    # make it recursive for all files
-    text_file_content = open(file_name, "r").read()
 
-    put_urls_from_string(text_to_process=text_file_content)
+    text_file_content = ""
+    if file_name is not None and not os.path.isdir(file_name):
+        print(f"📖 reading file `{file_name}`... ")
+        if not os.path.exists(file_name):
+            if file_name.startswith("https://") or file_name.startswith("http://"):
+                text_file_content = " " + file_name + " "
+            else:
+                print(f"\n file `{file_name}` not found.")
+                return
+        else:
+            text_file_content = _increment_file_name(
+                text_file_content=text_file_content, file_name=file_name
+            )
+    else:
+        input_folder = config.get_dir("input")
+        print(f"📂 reading {input_folder} directory... ")
+        if file_name is None:
+            dir_files = input_folder
+        else:
+            dir_files = file_name
+        text_files = os.listdir(dir_files)
+        for file in text_files:
+            if not file.endswith(".txt"):
+                text_files.remove(file)
+        if len(text_files) == 0:
+            print(f"No text files found in {input_folder} directory!")
+            return
+        elif len(text_files) == 1:
+            print(f"📖 reading file `{dir_files}/{text_files[0]}`... ")
+            text_file_content = _increment_file_name(
+                text_file_content=text_file_content,
+                file_name=os.path.join(dir_files, text_files[0]),
+            )
+        else:
+            print("\nChoose a text file. Use `*` for process all and `q` to quit:")
+            for index, file in enumerate(text_files):
+                print(f"[{index}]:", os.path.join(dir_files, file))
+
+            text_file_option = -1
+            while text_file_option < 0 or text_file_option >= len(text_files):
+                text_file_option = input("Enter the file number: ")
+                if text_file_option == "*":
+                    for file in text_files:
+                        text_file_content = _increment_file_name(
+                            text_file_content=text_file_content,
+                            file_name=os.path.join(dir_files, file),
+                        )
+                    text_file_option = 0
+                elif text_file_option == "q":
+                    return
+                elif text_file_option.isdigit():
+                    text_file_option = int(text_file_option)
+                    if text_file_option >= 0 and text_file_option < len(text_files):
+                        text_file_content = _increment_file_name(
+                            text_file_content=text_file_content,
+                            file_name=os.path.join(
+                                dir_files, text_files[int(text_file_option)]
+                            ),
+                        )
+
+    print("🔎 looking for urls...")
+    urls_found = put_urls_from_string(
+        text_to_process=text_file_content, verbose=verbose
+    )
 
-    # move_it_to_processed
     print("--------------------")
-    print(file_name, "processed")
+    print("files processed")
+    print(f"📦 {urls_found} urls were extracted and packed into the database")
 
 
-def put_urls_from_string(text_to_process, parent_url=None):
+def put_urls_from_string(text_to_process, parent_url=None, verbose=False):
     if isinstance(text_to_process, str):
        extractor = URLExtract()
        for url in extractor.find_urls(text_to_process):
            urls_manager.add_url(url=url, parent_url=parent_url)
-            print(url, "added")
+            if verbose:
+                print(url, "added")
 
        return len(extractor.find_urls(text_to_process))
     else:
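
The rewritten loader ultimately funnels everything through `put_urls_from_string`, which uses the urlextract package to pull URLs out of free text before handing each one to `urls_manager.add_url`. A minimal standalone example of just that extraction step, with made-up sample text and URLs (assumes urlextract is installed; the database calls are omitted):

from urlextract import URLExtract

text = (
    "Notes from the chat: see https://lnkd.in/abc123 "
    "and https://www.linkedin.com/posts/some-post for details"
)

extractor = URLExtract()
urls = extractor.find_urls(text)

print(urls)       # e.g. ['https://lnkd.in/abc123', 'https://www.linkedin.com/posts/some-post']
print(len(urls))  # the count that load_txt reports as extracted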