scrapping-cli 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapping_cli-1.0.0/PKG-INFO +3 -0
- scrapping_cli-1.0.0/mycli/__init__.py +0 -0
- scrapping_cli-1.0.0/mycli/cli.py +19 -0
- scrapping_cli-1.0.0/mycli/commands/__init__.py +0 -0
- scrapping_cli-1.0.0/mycli/commands/startproject.py +19 -0
- scrapping_cli-1.0.0/mycli/template/config.py +23 -0
- scrapping_cli-1.0.0/mycli/template/db.py +37 -0
- scrapping_cli-1.0.0/mycli/template/logger.py +7 -0
- scrapping_cli-1.0.0/mycli/template/model.py +5 -0
- scrapping_cli-1.0.0/mycli/template/parser.py +49 -0
- scrapping_cli-1.0.0/mycli/template/requirements.txt +3 -0
- scrapping_cli-1.0.0/mycli/template/simple_main.py +31 -0
- scrapping_cli-1.0.0/mycli/template/thread_main.py +60 -0
- scrapping_cli-1.0.0/pyproject.toml +3 -0
- scrapping_cli-1.0.0/scrapping_cli.egg-info/PKG-INFO +3 -0
- scrapping_cli-1.0.0/scrapping_cli.egg-info/SOURCES.txt +19 -0
- scrapping_cli-1.0.0/scrapping_cli.egg-info/dependency_links.txt +1 -0
- scrapping_cli-1.0.0/scrapping_cli.egg-info/entry_points.txt +2 -0
- scrapping_cli-1.0.0/scrapping_cli.egg-info/top_level.txt +1 -0
- scrapping_cli-1.0.0/setup.cfg +4 -0
- scrapping_cli-1.0.0/setup.py +16 -0
|
File without changes
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from mycli.commands import startproject
|
|
3
|
+
|
|
4
|
+
# Registry of CLI sub-commands: command name -> callable implementing it.
COMMANDS = dict(
    startproject=startproject.run,
)
|
|
7
|
+
|
|
8
|
+
def main():
    """Dispatch ``scrapping-cli <command> <project_name>`` to its handler.

    The command name is read from ``sys.argv[1]``; every remaining argument
    is forwarded positionally to the matching callable in ``COMMANDS``.
    A usage line is printed when fewer than two arguments are supplied, and
    an error line is printed when the command is not registered.

    Returns:
        None in every case.
    """
    if len(sys.argv) < 3:
        # The console script is installed as "scrapping-cli" (see the
        # entry_points in setup.py), so advertise that name, not "mycli".
        print("Usage: scrapping-cli <command> <project_name>")
        return

    command, *args = sys.argv[1:]
    handler = COMMANDS.get(command)
    if handler is None:
        print(f"Unknown command: {command}")
        return

    handler(*args)
|
|
File without changes
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import os
import shutil


def run(project_name):
    """Create a new project folder by copying the bundled template files.

    Args:
        project_name: Path (relative or absolute) of the folder to create.

    Returns:
        None. Prints a message and leaves the filesystem untouched when the
        target already exists; otherwise copies the template and prints a
        success message.

    Raises:
        OSError: Propagated from ``shutil.copytree`` if the copy fails.
    """
    # This module lives in mycli/commands/, and the template ships in
    # mycli/template/. Resolving it relative to __file__ avoids pkg_resources,
    # which is deprecated and is not a declared dependency of this package.
    package_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    template_dir = os.path.join(package_dir, "template")
    target_dir = os.path.abspath(project_name)

    if os.path.exists(target_dir):
        print(f"Folder '{project_name}' already exists!")
        return

    shutil.copytree(
        template_dir,
        target_dir,
        # Skip interpreter caches and macOS Finder metadata.
        ignore=shutil.ignore_patterns("__pycache__", "*.pyc", ".DS_Store"),
    )

    print(f"Project '{project_name}' created successfully!")
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from dotenv import load_dotenv
|
|
3
|
+
|
|
4
|
+
load_dotenv()
|
|
5
|
+
|
|
6
|
+
def get_env(key, default=None, required=False):
    """Look up an environment variable, optionally insisting that it exists.

    Args:
        key: Name of the environment variable.
        default: Value returned when the variable is not set.
        required: When true, a lookup that resolves to ``None`` is an error.

    Returns:
        The variable's string value, or ``default`` when it is unset. An
        empty-string value counts as set.

    Raises:
        ValueError: If ``required`` is true and the result is ``None``.
    """
    resolved = os.environ.get(key, default)
    if resolved is None and required:
        raise ValueError(f"Missing required env variable: {key}")
    return resolved
|
|
11
|
+
|
|
12
|
+
# MySQL connection settings. Every entry is mandatory, so importing this
# module raises ValueError when one of the variables is missing.
DB_CONFIG = {
    option: get_env(variable, required=True)
    for option, variable in (
        ("host", "DB_HOST"),
        ("user", "DB_USER"),
        ("password", "DB_PASSWORD"),
        ("database", "DB_NAME"),
    )
}

# Folder scanned for input files (mandatory) and folder for generated output.
FOLDER_PATH = get_env("INPUT_FOLDER", required=True)
OUTPUT_FOLDER_PATH = get_env("OUTPUT_FOLDER", default="./output")

# Tuning knobs; values read from the environment are converted to int.
MAX_WORKERS = int(get_env("MAX_WORKERS", default=5))
BATCH_SIZE = int(get_env("BATCH_SIZE", default=500))
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import mysql.connector
|
|
2
|
+
from config import DB_CONFIG
|
|
3
|
+
from logger import setup_logger
|
|
4
|
+
|
|
5
|
+
logger = setup_logger()
|
|
6
|
+
|
|
7
|
+
def get_connection():
    """Open a MySQL connection without selecting a database.

    Only the host, user and password entries of ``DB_CONFIG`` are passed to
    the connector; the ``database`` entry is deliberately left out.

    Returns:
        The connection object returned by ``mysql.connector.connect``.
    """
    server_settings = {
        field: DB_CONFIG[field] for field in ("host", "user", "password")
    }
    return mysql.connector.connect(**server_settings)
|
|
13
|
+
|
|
14
|
+
def get_connection_thread():
    """Open a MySQL connection using every setting in ``DB_CONFIG``.

    Unlike ``get_connection``, the ``database`` entry is included, so the
    configured database is selected on connect.

    Returns:
        The connection object returned by ``mysql.connector.connect``.
    """
    connection = mysql.connector.connect(**DB_CONFIG)
    return connection
|
|
16
|
+
|
|
17
|
+
def create_database(cursor):
    """Create the configured database if needed and make it the current one.

    Args:
        cursor: An open database cursor; two statements are executed on it.
    """
    db_name = DB_CONFIG["database"]
    # Identifiers cannot be sent as bound parameters, so quote the name with
    # backticks (doubling any embedded backtick). This keeps names containing
    # characters such as "-" valid and stops the value from being parsed as
    # additional SQL.
    quoted_name = "`" + db_name.replace("`", "``") + "`"
    cursor.execute(f"CREATE DATABASE IF NOT EXISTS {quoted_name}")
    cursor.execute(f"USE {quoted_name}")
    logger.info(f"Database ready: {db_name}")
|
|
22
|
+
|
|
23
|
+
def create_table(cursor):
    """Ensure the placeholder table ``table_name`` exists.

    Args:
        cursor: An open database cursor on which the DDL is executed.
    """
    # Template DDL: only an auto-increment primary key is defined here.
    ddl = """
        CREATE TABLE IF NOT EXISTS table_name(
            id INT AUTO_INCREMENT PRIMARY KEY
        )
    """
    cursor.execute(ddl)
    logger.info("Table created")
|
|
30
|
+
|
|
31
|
+
def insert_multiple_data(cursor, data):
    """Insert a batch of records with a single ``executemany`` call.

    Args:
        cursor: An open database cursor.
        data: Sequence of parameter rows, one per record. Nothing is
            executed when it is empty or ``None``.
    """
    if not data:
        return

    # Template statement: column names and placeholders are left blank for
    # the project author to fill in.
    query = "INSERT INTO table_name() VALUES ()"
    # executemany needs the rows as its second argument; calling it with the
    # statement alone raises TypeError before anything is sent.
    cursor.executemany(query, data)
    logger.info(f"Inserted {len(data)} records")
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import gzip
|
|
3
|
+
import os
|
|
4
|
+
from logger import setup_logger
|
|
5
|
+
|
|
6
|
+
logger = setup_logger()
|
|
7
|
+
|
|
8
|
+
def load_file(file_path):
    """Read a ``.json`` or gzip-compressed ``.gz`` file and decode it as JSON.

    Args:
        file_path: Path of the file; the handler is chosen by its extension.

    Returns:
        The decoded JSON value, or ``None`` when the extension is not
        supported or reading/decoding fails (both cases are logged).
    """
    suffix = os.path.splitext(file_path)[1]
    # Extension -> (opener, text mode); both are read as UTF-8 text.
    readers = {
        ".gz": (gzip.open, "rt"),
        ".json": (open, "r"),
    }

    try:
        reader = readers.get(suffix)
        if reader is None:
            logger.warning(f"Unsupported file: {file_path}")
            return None

        opener, mode = reader
        with opener(file_path, mode, encoding="utf-8") as handle:
            return json.load(handle)
    except Exception as err:
        logger.error(f"File read error: {file_path} | {err}")
        return None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# MAIN PARSER ENTRY
|
|
30
|
+
def parse_file(file_path):
    """Load one input file and pass its content through ``transform``.

    Args:
        file_path: Path of the file to load.

    Returns:
        Whatever ``transform`` returns, or ``None`` when the file could not
        be loaded, its content is empty/falsy, or ``transform`` raises (the
        error is logged).
    """
    payload = load_file(file_path)
    if payload:
        try:
            return transform(payload)
        except Exception as err:
            logger.error(f"Parse error: {file_path} | {err}")
    return None
|
|
40
|
+
|
|
41
|
+
def transform(raw):
    """Extension point for project-specific extraction logic.

    The template implementation is a pass-through: replace the body with
    the code that turns the loaded content into the record(s) to store
    (for example plain JSON access, lxml, or parsel).

    Args:
        raw: The decoded content handed over by ``parse_file``.

    Returns:
        The same object that was passed in.
    """
    extracted = raw
    return extracted
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import os

# FOLDER_PATH is defined in config, and db exposes insert_multiple_data;
# neither `db.FOLDER_PATH` nor `db.insert_data` exists.
from config import FOLDER_PATH
from db import create_database, create_table, get_connection, insert_multiple_data
from logger import setup_logger
from parser import parse_file

logger = setup_logger()


def main():
    """Parse every file in ``FOLDER_PATH`` and insert it, one file at a time.

    Prepares the database and table first, then commits after each file.
    A failure on one file is reported and does not stop the loop. The cursor
    and the connection are always closed, even when setup fails.
    """
    conn = get_connection()
    try:
        cursor = conn.cursor()
        try:
            create_database(cursor)
            create_table(cursor)

            for file_name in os.listdir(FOLDER_PATH):
                file_path = os.path.join(FOLDER_PATH, file_name)
                print(f"Processing: {file_name}")

                try:
                    data = parse_file(file_path)
                    if not data:
                        # parse_file returns None (and logs) on failure;
                        # there is nothing to insert for this file.
                        continue
                    # One parsed file is one record, so wrap it to form the
                    # row sequence insert_multiple_data expects.
                    insert_multiple_data(cursor, [data])
                    conn.commit()
                except Exception as e:
                    print(f"Error in {file_name}: {e}")
        finally:
            cursor.close()
    finally:
        conn.close()

    print("Done!")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import time
|
|
3
|
+
|
|
4
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
5
|
+
|
|
6
|
+
from parser import parse_file
|
|
7
|
+
from db import get_connection, get_connection_thread, create_table, create_database, insert_multiple_data
|
|
8
|
+
from config import FOLDER_PATH, BATCH_SIZE, MAX_WORKERS
|
|
9
|
+
from logger import setup_logger
|
|
10
|
+
|
|
11
|
+
logger = setup_logger()
|
|
12
|
+
|
|
13
|
+
def insert_batch(batch):
    """Insert one batch of records over a dedicated connection and commit.

    A fresh connection is opened per call, so the function can run in a
    worker thread. The cursor and the connection are closed even when the
    insert or the commit raises; the exception then propagates to the caller.

    Args:
        batch: List of records handed to ``insert_multiple_data``.
    """
    conn = get_connection_thread()
    try:
        cursor = conn.cursor()
        try:
            insert_multiple_data(cursor, batch)
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
|
|
22
|
+
|
|
23
|
+
def main():
    """Parse all files in ``FOLDER_PATH`` in parallel and insert in batches.

    Steps:
      1. Create the database and table over a short-lived setup connection.
      2. Submit one ``parse_file`` task per directory entry to a thread pool.
      3. Collect results as they complete and hand every ``BATCH_SIZE``
         records (plus the final partial batch) to a second pool running
         ``insert_batch``.
      4. Wait on the insert futures so that any insert error is raised here.

    The total runtime in seconds is logged at the end.
    """
    start = time.time()

    # Schema setup; close the cursor and connection even if a statement fails.
    conn = get_connection()
    try:
        cursor = conn.cursor()
        try:
            create_database(cursor)
            create_table(cursor)
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()

    batch = []
    futures = []

    with ThreadPoolExecutor(MAX_WORKERS) as parser_pool:
        tasks = {
            parser_pool.submit(parse_file, os.path.join(FOLDER_PATH, f)): f
            for f in os.listdir(FOLDER_PATH)
        }

        with ThreadPoolExecutor(MAX_WORKERS) as db_pool:
            for future in as_completed(tasks):
                result = future.result()
                if not result:
                    # parse_file returns None for unreadable/unparsable files.
                    continue

                batch.append(result)
                if len(batch) >= BATCH_SIZE:
                    # Hand the full list to the worker and start a new one,
                    # so the worker never sees later appends.
                    futures.append(db_pool.submit(insert_batch, batch))
                    batch = []

            if batch:
                futures.append(db_pool.submit(insert_batch, batch))

        # Surface any exception raised inside an insert worker.
        for f in futures:
            f.result()

    logger.info(f"Runtime: {time.time() - start}")


# Without this guard, running the script defines main() but never calls it.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
setup.py
|
|
3
|
+
mycli/__init__.py
|
|
4
|
+
mycli/cli.py
|
|
5
|
+
mycli/commands/__init__.py
|
|
6
|
+
mycli/commands/startproject.py
|
|
7
|
+
mycli/template/config.py
|
|
8
|
+
mycli/template/db.py
|
|
9
|
+
mycli/template/logger.py
|
|
10
|
+
mycli/template/model.py
|
|
11
|
+
mycli/template/parser.py
|
|
12
|
+
mycli/template/requirements.txt
|
|
13
|
+
mycli/template/simple_main.py
|
|
14
|
+
mycli/template/thread_main.py
|
|
15
|
+
scrapping_cli.egg-info/PKG-INFO
|
|
16
|
+
scrapping_cli.egg-info/SOURCES.txt
|
|
17
|
+
scrapping_cli.egg-info/dependency_links.txt
|
|
18
|
+
scrapping_cli.egg-info/entry_points.txt
|
|
19
|
+
scrapping_cli.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
mycli
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from setuptools import setup, find_packages
|
|
2
|
+
|
|
3
|
+
# Distribution metadata for the scrapping-cli project scaffolder.
setup(
    name="scrapping-cli",
    version="1.0.0",
    packages=find_packages(),
    # Ship the project template files together with the mycli package.
    include_package_data=True,
    package_data={"mycli": ["template/**/*"]},
    # Installs the `scrapping-cli` command, which calls mycli.cli:main.
    entry_points={"console_scripts": ["scrapping-cli=mycli.cli:main"]},
)
|