scrapping-cli 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ Metadata-Version: 2.4
2
+ Name: scrapping-cli
3
+ Version: 1.0.0
File without changes
@@ -0,0 +1,19 @@
1
import sys
from mycli.commands import startproject

# Registry mapping CLI command names to their handler callables.
COMMANDS = {
    "startproject": startproject.run,
}


def main():
    """Entry point for the ``scrapping-cli`` console script.

    Expects ``scrapping-cli <command> <project_name>`` on the command
    line and dispatches to the matching handler in COMMANDS, forwarding
    every argument after the command name.
    """
    if len(sys.argv) < 3:
        # Bug fix: the installed console script is named "scrapping-cli"
        # (see entry_points), not "mycli" — the usage line should match.
        print("Usage: scrapping-cli <command> <project_name>")
        return

    command = sys.argv[1]
    args = sys.argv[2:]

    if command in COMMANDS:
        COMMANDS[command](*args)
    else:
        print(f"Unknown command: {command}")
File without changes
@@ -0,0 +1,19 @@
1
+ import os
2
+ import shutil
3
+ import pkg_resources
4
+
5
def run(project_name):
    """Create a new project folder from the bundled template.

    Copies the packaged ``mycli/template`` directory to a directory
    named *project_name* (resolved to an absolute path), skipping
    Python caches and OS metadata files. Refuses to overwrite an
    existing folder.

    NOTE(review): pkg_resources is deprecated in modern setuptools;
    consider importlib.resources once a minimum Python is pinned.
    """
    source = pkg_resources.resource_filename("mycli", "template")
    destination = os.path.abspath(project_name)

    if os.path.exists(destination):
        print(f"Folder '{project_name}' already exists!")
        return

    skip = shutil.ignore_patterns("__pycache__", "*.pyc", ".DS_Store")
    shutil.copytree(source, destination, ignore=skip)

    print(f"Project '{project_name}' created successfully!")
@@ -0,0 +1,23 @@
1
+ import os
2
+ from dotenv import load_dotenv
3
+
4
+ load_dotenv()
5
+
6
def get_env(key, default=None, required=False):
    """Read *key* from the process environment.

    Returns *default* when the variable is unset. When *required* is
    true and no value can be resolved, raises ValueError instead of
    silently returning None.
    """
    resolved = os.getenv(key, default)
    if required and resolved is None:
        raise ValueError(f"Missing required env variable: {key}")
    return resolved
11
+
12
# MySQL connection settings; all four values must be present in the
# environment (or the .env file loaded above) or importing this module
# raises ValueError via get_env(required=True).
DB_CONFIG = {
    "host": get_env("DB_HOST", required=True),
    "user": get_env("DB_USER", required=True),
    "password": get_env("DB_PASSWORD", required=True),
    "database": get_env("DB_NAME", required=True),
}

# Directory scanned for input files; required at import time.
FOLDER_PATH = get_env("INPUT_FOLDER", required=True)
# Output directory; falls back to ./output when OUTPUT_FOLDER is unset.
OUTPUT_FOLDER_PATH = get_env("OUTPUT_FOLDER", "./output")

# Thread-pool size and insert batch size for the threaded pipeline
# (see thread_main.py); defaults apply when the env vars are unset.
MAX_WORKERS = int(get_env("MAX_WORKERS", 5))
BATCH_SIZE = int(get_env("BATCH_SIZE", 500))
@@ -0,0 +1,37 @@
1
+ import mysql.connector
2
+ from config import DB_CONFIG
3
+ from logger import setup_logger
4
+
5
+ logger = setup_logger()
6
+
7
def get_connection():
    """Open a MySQL connection without selecting a database.

    Only host/user/password are taken from DB_CONFIG so that
    create_database() can run before the schema exists.
    """
    credentials = {key: DB_CONFIG[key] for key in ("host", "user", "password")}
    return mysql.connector.connect(**credentials)
13
+
14
def get_connection_thread():
    """Open a per-thread MySQL connection bound to the configured database.

    Unlike get_connection(), this passes the full DB_CONFIG — including
    "database" — so the schema must already exist.
    """
    return mysql.connector.connect(
        host=DB_CONFIG["host"],
        user=DB_CONFIG["user"],
        password=DB_CONFIG["password"],
        database=DB_CONFIG["database"],
    )
16
+
17
def create_database(cursor):
    """Create the configured database if needed and switch to it.

    Raises ValueError when the configured name contains characters
    outside [A-Za-z0-9_$]: identifiers cannot be bound as query
    parameters, so the name must be interpolated into the SQL text
    and is validated first to prevent injection via the DB_NAME env var.
    """
    db_name = DB_CONFIG["database"]
    if not all(ch.isalnum() or ch in "_$" for ch in db_name):
        raise ValueError(f"Invalid database name: {db_name!r}")
    cursor.execute(f"CREATE DATABASE IF NOT EXISTS {db_name}")
    cursor.execute(f"USE {db_name}")
    logger.info(f"Database ready: {db_name}")
22
+
23
def create_table(cursor):
    """Create table_name if it does not already exist.

    This is a starter schema for generated projects: only an
    auto-increment primary key is defined; users add real columns
    for their data.
    """
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS table_name(
            id INT AUTO_INCREMENT PRIMARY KEY
        )
    """)
    # Logged even when the table already existed (IF NOT EXISTS).
    logger.info("Table created")
30
+
31
def insert_multiple_data(cursor, data):
    """Bulk-insert *data* (a sequence of parameter rows) into table_name.

    No-ops on an empty/None payload. The column list and VALUES
    placeholders are a template — fill them in to match the schema
    created by create_table().
    """
    if not data:
        return

    query = "INSERT INTO table_name() VALUES ()"
    # Bug fix: executemany requires the sequence of parameter rows as
    # its second argument; it was previously called without `data`,
    # which raises TypeError before anything is inserted.
    cursor.executemany(query, data)
    logger.info(f"Inserted {len(data)} records")
@@ -0,0 +1,7 @@
1
+ import logging
2
+
3
def setup_logging():
    """Configure the root logger with a timestamped console format.

    Safe to call repeatedly: logging.basicConfig is a no-op once
    handlers are installed on the root logger.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    )


def setup_logger(name=None):
    """Configure logging and return a Logger instance.

    Bug fix: every other module in the template does
    ``from logger import setup_logger`` and uses its return value,
    but this module only defined ``setup_logging`` (which returns
    None), so those imports failed. This wrapper provides the expected
    name while keeping ``setup_logging`` for backward compatibility.
    """
    setup_logging()
    return logging.getLogger(name)
@@ -0,0 +1,5 @@
1
from pydantic import BaseModel, field_validator
import re
from logger import setup_logger

# Module-level logger shared by the validation models defined below.
# NOTE(review): logger.py defines setup_logging, not setup_logger —
# this import fails until logger.py exposes setup_logger; confirm.
logger = setup_logger()
@@ -0,0 +1,49 @@
1
+ import json
2
+ import gzip
3
+ import os
4
+ from logger import setup_logger
5
+
6
+ logger = setup_logger()
7
+
8
def load_file(file_path):
    """Load a JSON document from *file_path*.

    Supports plain ``.json`` files and gzip-compressed ``.gz`` files,
    matching the extension case-insensitively. Returns the parsed
    object, or None when the extension is unsupported or reading/
    parsing fails (failures are logged, never raised).
    """
    # The basename half of splitext was unused; lowercase the extension
    # so ".JSON" / ".GZ" files are not silently skipped.
    _, ext = os.path.splitext(file_path)
    ext = ext.lower()

    try:
        if ext == ".gz":
            with gzip.open(file_path, "rt", encoding="utf-8") as f:
                return json.load(f)

        elif ext == ".json":
            with open(file_path, "r", encoding="utf-8") as f:
                return json.load(f)

        else:
            logger.warning(f"Unsupported file: {file_path}")
            return None

    except Exception as e:
        logger.error(f"File read error: {file_path} | {e}")
        return None
27
+
28
+
29
# MAIN PARSER ENTRY
def parse_file(file_path):
    """Load *file_path* and run the user-defined transform() on it.

    Returns the transformed record, or None when the file could not be
    loaded or the transform raised (errors are logged, not propagated).
    """
    raw = load_file(file_path)
    # Bug fix: compare against None explicitly — a legitimately empty
    # document ({} / [] / 0) is falsy but should still be transformed.
    if raw is None:
        return None

    try:
        return transform(raw)
    except Exception as e:
        logger.error(f"Parse error: {file_path} | {e}")
        return None
40
+
41
def transform(raw):
    """Placeholder extraction hook — returns *raw* unchanged.

    User writes extraction logic here
    Supports:
    - JSON
    - lxml
    - parsel
    """
    return raw
@@ -0,0 +1,3 @@
1
+ python-dotenv
2
+ mysql-connector-python
3
+ pydantic
@@ -0,0 +1,31 @@
1
import os
from parser import parse_file
# Bug fix: FOLDER_PATH lives in config (not db), and db exposes
# insert_multiple_data — there is no insert_data. The previous import
# line raised ImportError before main() could ever run.
from config import FOLDER_PATH
from db import get_connection, create_table, create_database, insert_multiple_data
from logger import setup_logger

logger = setup_logger()


def main():
    """Sequentially parse every file in FOLDER_PATH and insert results.

    A single connection handles schema setup and all inserts; each file
    is committed individually so one bad file does not roll back the
    work done for the others.
    """
    conn = get_connection()
    cursor = conn.cursor()

    create_database(cursor)
    create_table(cursor)

    for file_name in os.listdir(FOLDER_PATH):
        file_path = os.path.join(FOLDER_PATH, file_name)
        print(f"Processing: {file_name}")

        try:
            data = parse_file(file_path)
            if data is not None:
                # insert_multiple_data expects a sequence of rows, so a
                # single parsed record is wrapped in a one-element list.
                insert_multiple_data(cursor, [data])
            conn.commit()

        except Exception as e:
            print(f"Error in {file_name}: {e}")

    cursor.close()
    conn.close()
    print("Done!")


if __name__ == "__main__":
    main()
@@ -0,0 +1,60 @@
1
+ import os
2
+ import time
3
+
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
5
+
6
+ from parser import parse_file
7
+ from db import get_connection, get_connection_thread, create_table, create_database, insert_multiple_data
8
+ from config import FOLDER_PATH, BATCH_SIZE, MAX_WORKERS
9
+ from logger import setup_logger
10
+
11
+ logger = setup_logger()
12
+
13
def insert_batch(batch):
    """Insert one batch of rows on a dedicated connection (db_pool worker).

    The cursor and connection are closed in a finally block so a failed
    insert cannot leak a connection per failed batch.
    """
    conn = get_connection_thread()
    cursor = conn.cursor()
    try:
        insert_multiple_data(cursor, batch)
        conn.commit()
    finally:
        cursor.close()
        conn.close()
22
+
23
def main():
    """Parse files concurrently and batch-insert the results.

    Pipeline: one thread pool parses every file in FOLDER_PATH; as
    results complete they are accumulated into batches of BATCH_SIZE
    and handed to a second pool that writes each batch on its own
    connection (see insert_batch).
    """
    start = time.time()

    # Schema setup on a short-lived connection (no database selected yet).
    conn = get_connection()
    cursor = conn.cursor()
    create_database(cursor)
    create_table(cursor)
    conn.commit()
    cursor.close()
    conn.close()

    batch = []
    futures = []

    with ThreadPoolExecutor(MAX_WORKERS) as parser_pool:
        tasks = {
            parser_pool.submit(parse_file, os.path.join(FOLDER_PATH, f)): f
            for f in os.listdir(FOLDER_PATH)
        }

        with ThreadPoolExecutor(MAX_WORKERS) as db_pool:
            for future in as_completed(tasks):
                result = future.result()

                if result:
                    batch.append(result)

                if len(batch) >= BATCH_SIZE:
                    # Copy before clearing: the worker must own its rows.
                    futures.append(db_pool.submit(insert_batch, batch.copy()))
                    batch.clear()

            # Flush the final partial batch while db_pool is still open.
            if batch:
                futures.append(db_pool.submit(insert_batch, batch.copy()))

            for f in futures:
                # Propagate any insert-worker exception to the main thread.
                f.result()

    logger.info(f"Runtime: {time.time() - start}")


if __name__ == "__main__":
    # Bug fix: main() was defined but never invoked, so running this
    # script directly did nothing.
    main()
@@ -0,0 +1,3 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,3 @@
1
+ Metadata-Version: 2.4
2
+ Name: scrapping-cli
3
+ Version: 1.0.0
@@ -0,0 +1,19 @@
1
+ pyproject.toml
2
+ setup.py
3
+ mycli/__init__.py
4
+ mycli/cli.py
5
+ mycli/commands/__init__.py
6
+ mycli/commands/startproject.py
7
+ mycli/template/config.py
8
+ mycli/template/db.py
9
+ mycli/template/logger.py
10
+ mycli/template/model.py
11
+ mycli/template/parser.py
12
+ mycli/template/requirements.txt
13
+ mycli/template/simple_main.py
14
+ mycli/template/thread_main.py
15
+ scrapping_cli.egg-info/PKG-INFO
16
+ scrapping_cli.egg-info/SOURCES.txt
17
+ scrapping_cli.egg-info/dependency_links.txt
18
+ scrapping_cli.egg-info/entry_points.txt
19
+ scrapping_cli.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ scrapping-cli = mycli.cli:main
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,16 @@
1
from setuptools import setup, find_packages

setup(
    name="scrapping-cli",
    version="1.0.0",
    packages=find_packages(),
    include_package_data=True,
    # Ship the bundled project template alongside the package code.
    package_data={"mycli": ["template/**/*"]},
    entry_points={
        "console_scripts": [
            "scrapping-cli=mycli.cli:main",
        ],
    },
)