junshan-kit 2.1.6__py2.py3-none-any.whl → 2.1.8__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- junshan_kit/DataProcessor.py +62 -2
- junshan_kit/datahub.py +89 -13
- junshan_kit/test.py +2 -4
- junshan_kit-2.1.8.dist-info/METADATA +11 -0
- junshan_kit-2.1.8.dist-info/RECORD +7 -0
- junshan_kit-2.1.6.dist-info/METADATA +0 -31
- junshan_kit-2.1.6.dist-info/RECORD +0 -7
- {junshan_kit-2.1.6.dist-info → junshan_kit-2.1.8.dist-info}/WHEEL +0 -0
junshan_kit/DataProcessor.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
|
2
2
|
import pandas as pd
|
3
3
|
import os
|
4
|
-
|
4
|
+
from sklearn.preprocessing import StandardScaler
|
5
5
|
import junshan_kit.datahub
|
6
6
|
|
7
7
|
class CSVToPandas:
|
@@ -16,6 +16,7 @@ class CSVToPandas:
|
|
16
16
|
|
17
17
|
# ----------------- ccfd_kaggle ----------------------------------
|
18
18
|
def ccfd_kaggle(self, data_name = 'ccfd-kaggle', show_info = True):
|
19
|
+
# download data if not exist
|
19
20
|
self.read_csv(data_name)
|
20
21
|
|
21
22
|
df = pd.read_csv(self.csv_path)
|
@@ -33,13 +34,72 @@ class CSVToPandas:
|
|
33
34
|
print('='*60)
|
34
35
|
print(f"{'Original size:':<25} {m_before} rows x {n_before} cols")
|
35
36
|
print(f"{'Size after dropping NaNs:':<25} {m_after} rows x {n_after} cols")
|
36
|
-
print(f"{'Export size:':<25} {m_after} rows x {n_after} cols")
|
37
37
|
print(f"{'Positive samples (+1):':<25} {pos_count}")
|
38
38
|
print(f"{'Negative samples (-1):':<25} {neg_count}")
|
39
|
+
print(f"{'Export size:':<25} {m_after} rows x {n_after} cols")
|
39
40
|
print('-'*60)
|
40
41
|
print(f"More details: https://www.jianguoyun.com/p/Dd1clVgQ4ZThCxiwzZQGIAA")
|
41
42
|
print('='*60 + '\n')
|
42
43
|
|
43
44
|
return df
|
45
|
+
|
46
|
+
# ------------------------
|
47
|
+
def ghpdd_kaggle(self, data_name='ghpdd-kaggle', show_info=True):
    """Load the GHPDD-Kaggle CSV and return it as an encoded, scaled DataFrame.

    Processing steps, in order:
      1. ``self.read_csv(data_name)`` is called first (defined elsewhere;
         presumably it fetches the data if missing and sets
         ``self.csv_path`` -- confirm against that method).
      2. Rows containing any NaN are dropped.
      3. The ``property_id`` identifier column is removed when present.
      4. Label ``0`` in the ``decision`` column is replaced with ``-1``.
      5. The four categorical columns are one-hot encoded and the resulting
         boolean columns are cast to ``int``.
      6. Every remaining feature column is standardized with
         ``StandardScaler``.

    Args:
        data_name: Dataset identifier forwarded to ``self.read_csv``.
        show_info: When true, print a summary of sizes and label counts.

    Returns:
        The processed ``pandas.DataFrame``, including the ``decision`` column.

    Raises:
        KeyError: If ``decision`` or one of the categorical columns is
            missing from the CSV.
    """
    # download data if not exist
    self.read_csv(data_name)

    # read csv
    df = pd.read_csv(self.csv_path)
    m_before, n_before = df.shape

    # drop NaNs
    df = df.dropna(axis=0, how='any')
    m_after, n_after = df.shape

    # drop unique identifier; errors='ignore' makes this a no-op when the
    # column is absent, so no explicit membership test is needed
    df = df.drop(columns=['property_id'], errors='ignore')

    # Replace label 0 with -1
    df['decision'] = df['decision'].replace(0, -1)

    # Identify categorical and numerical columns. The numerical list is
    # captured before encoding so the dummy columns are never standardized.
    cat_cols = ['country', 'city', 'property_type', 'furnishing_status']
    excluded = set(cat_cols) | {'decision'}
    num_cols = [col for col in df.columns if col not in excluded]

    # One-Hot encode categorical columns
    df = pd.get_dummies(df, columns=cat_cols)

    # Convert boolean columns to int in a single vectorized cast
    bool_cols = df.select_dtypes(include='bool').columns
    df = df.astype({col: int for col in bool_cols})

    # Standardize numerical columns. StandardScaler raises on an input with
    # zero rows or zero columns, so skip scaling when there is nothing to fit.
    if num_cols and len(df) > 0:
        df[num_cols] = StandardScaler().fit_transform(df[num_cols])

    # Size of the frame that is handed back to the caller
    m_export, n_export = df.shape

    if show_info:
        pos_count = (df['decision'] == 1).sum()
        neg_count = (df['decision'] == -1).sum()

        print('\n' + '='*70)
        print(f"{'GHPDD-Kaggle Dataset Info':^70}")
        print('='*70)
        print(f"{'Original size:':<35} {m_before} rows x {n_before} cols")
        print(f"{'Size after dropping NaNs:':<35} {m_after} rows x {n_after} cols")
        print(f"{'Positive samples (+1):':<35} {pos_count}")
        print(f"{'Negative samples (-1):':<35} {neg_count}")
        print(f"{'Export size (after encoding & scaling):':<35} {m_export} rows x {n_export} cols")
        print('-'*70)
        print("Note: categorical columns one-hot encoded, numerical standardized.")
        print("More details: https://www.jianguoyun.com/p/DU6Lr9oQqdHDDRj5sI0GIAA")
        print('='*70 + '\n')

    return df
|
103
|
+
|
44
104
|
|
45
105
|
|
junshan_kit/datahub.py
CHANGED
@@ -1,8 +1,13 @@
|
|
1
1
|
import kagglehub
|
2
|
-
import os
|
2
|
+
import os, time
|
3
3
|
import warnings
|
4
4
|
import shutil
|
5
5
|
from kaggle.api.kaggle_api_extended import KaggleApi
|
6
|
+
from selenium import webdriver
|
7
|
+
from selenium.webdriver.common.by import By
|
8
|
+
from selenium.webdriver.chrome.options import Options
|
9
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
10
|
+
from selenium.webdriver.support import expected_conditions as EC
|
6
11
|
|
7
12
|
class kaggle_data:
|
8
13
|
def list_datasets(self):
|
@@ -42,20 +47,91 @@ class kaggle_data:
|
|
42
47
|
# example: read_data(copy_path='./exp_data')
|
43
48
|
|
44
49
|
|
50
|
+
class JianguoDownloader:
    """Download a file from a Jianguoyun share page by driving Chrome.

    Constructing an instance creates the download directory and immediately
    starts a Chrome session configured to save files there without prompting.
    Call :meth:`run` for the whole open -> click -> wait -> close sequence,
    or call the individual steps and finish with :meth:`close`.
    """

    # Suffix Chrome gives to a download that is still being written.
    _PARTIAL_SUFFIX = ".crdownload"

    def __init__(self, url: str, download_path: str = "./downloads"):
        """Prepare the download directory and launch Chrome.

        Args:
            url: Address of the Jianguoyun share page.
            download_path: Directory for downloaded files; it is converted to
                an absolute path and created if it does not exist.
        """
        self.url = url
        self.download_path = os.path.abspath(download_path)
        os.makedirs(self.download_path, exist_ok=True)

        # Configure Chrome options
        self.chrome_options = Options()
        prefs = {
            "download.default_directory": self.download_path,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True,
            "profile.default_content_setting_values.automatic_downloads": 1,
        }
        self.chrome_options.add_experimental_option("prefs", prefs)

        # Optional stability flags
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
        self.chrome_options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")

        # Start Chrome
        self.driver = webdriver.Chrome(options=self.chrome_options)

    def open_page(self):
        """Open the Jianguoyun share page."""
        print(f"🌐 Opening link: {self.url}")
        self.driver.get(self.url)

    def click_download_button(self):
        """Find and click the download button.

        Waits up to 30 seconds for a ``<span>`` whose text contains '下载'
        (Chinese for "download") and clicks its parent element.
        """
        print("🔍 Looking for the download button...")
        wait = WebDriverWait(self.driver, 30)
        span = wait.until(
            EC.presence_of_element_located((By.XPATH, "//span[contains(text(),'下载')]"))
        )
        parent = span.find_element(By.XPATH, "./..")
        # Click through JavaScript rather than a native WebElement click.
        self.driver.execute_script("arguments[0].click();", parent)
        print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")

        # If Jianguoyun opens a new tab, switch to it
        time.sleep(3)
        if len(self.driver.window_handles) > 1:
            self.driver.switch_to.window(self.driver.window_handles[-1])
            print("📂 Switched to download tab.")

    def _finished_files(self):
        """Return the names of regular, fully written files in the download directory."""
        return {
            name
            for name in os.listdir(self.download_path)
            if not name.endswith(self._PARTIAL_SUFFIX)
            and os.path.isfile(os.path.join(self.download_path, name))
        }

    def wait_for_downloads(self, timeout=300, known_files=None):
        """Wait until all downloads are finished.

        Args:
            timeout: Maximum number of seconds to wait.
            known_files: Optional collection of file names that were already
                in the download directory before the download was triggered.
                When given, the wait succeeds only after at least one new
                finished file has appeared, so a download that has not
                started writing yet is not mistaken for a completed one.
                When omitted, only the absence of partial files is checked.

        Returns:
            True when the downloads finished, False when the timeout expired.
        """
        print("⏳ Waiting for downloads to finish...")
        baseline = None if known_files is None else set(known_files)
        # A monotonic clock keeps the timeout immune to system clock changes.
        deadline = time.monotonic() + timeout
        while True:
            in_progress = any(
                name.endswith(self._PARTIAL_SUFFIX)
                for name in os.listdir(self.download_path)
            )
            arrived = baseline is None or bool(self._finished_files() - baseline)
            if arrived and not in_progress:
                print("✅ Download completed!")
                return True
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                print("⏰ Timeout: downloads may not have finished.")
                return False
            # Poll every two seconds, but never sleep past the deadline.
            time.sleep(min(2, remaining))

    def get_latest_file(self):
        """Return the most recently downloaded file (if any).

        Sub-directories and partial ``.crdownload`` files are ignored.

        Returns:
            Path of the newest finished file by ``os.path.getctime``, or None
            when the directory holds no finished file.
        """
        paths = [os.path.join(self.download_path, name) for name in self._finished_files()]
        return max(paths, key=os.path.getctime, default=None)

    def close(self):
        """Close the browser."""
        self.driver.quit()
        print("🚪 Browser closed.")

    def run(self):
        """Run the complete download process.

        Errors are reported on stdout instead of being raised, and the
        browser is always closed.

        Returns:
            Path of the downloaded file, or None when the download failed or
            timed out.
        """
        print('*'*50)
        latest = None
        try:
            # Snapshot the directory first so that files left over from an
            # earlier run are not reported as this run's download.
            known = set(os.listdir(self.download_path))
            self.open_page()
            self.click_download_button()
            if self.wait_for_downloads(known_files=known):
                latest = self.get_latest_file()
                if latest:
                    print(f"📄 Latest downloaded file: {latest}")
        except Exception as e:
            print("❌ Error occurred:", e)
        finally:
            self.close()
        print('*'*50)
        return latest
|
61
137
|
|
junshan_kit/test.py
CHANGED
@@ -0,0 +1,11 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: junshan_kit
|
3
|
+
Version: 2.1.8
|
4
|
+
Summary: This is an optimization tool.
|
5
|
+
Author-email: Junshan Yin <junshanyin@163.com>
|
6
|
+
Requires-Dist: kaggle==1.7.4.5
|
7
|
+
Requires-Dist: kagglehub==0.3.13
|
8
|
+
Requires-Dist: numpy==2.2.6
|
9
|
+
Requires-Dist: pandas==2.3.3
|
10
|
+
Requires-Dist: scikit-learn==1.7.1
|
11
|
+
Requires-Dist: selenium==4.36.0
|
@@ -0,0 +1,7 @@
|
|
1
|
+
junshan_kit/DataProcessor.py,sha256=AW_1jROexC3s41-RgzqzYVwPI0sOf3tzjiph4qa_Vcw,3882
|
2
|
+
junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
junshan_kit/datahub.py,sha256=mofbkp8ry6_LM_vW1LcZolp5tfkqOp_cUiwjfDFbRqI,5153
|
4
|
+
junshan_kit/test.py,sha256=uSckjcr_Wgj__YPTwD6x0GY8Hfn5GBEXIpRf9vIYBbU,91
|
5
|
+
junshan_kit-2.1.8.dist-info/METADATA,sha256=eFQmrVEUORZRhZqBCOlctfSU3vwCQ2RB4Jpyj1coAmE,329
|
6
|
+
junshan_kit-2.1.8.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
|
7
|
+
junshan_kit-2.1.8.dist-info/RECORD,,
|
@@ -1,31 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: junshan_kit
|
3
|
-
Version: 2.1.6
|
4
|
-
Summary: This is an optimization tool.
|
5
|
-
Author-email: Junshan Yin <junshanyin@163.com>
|
6
|
-
Requires-Dist: kaggle==1.7.4.5
|
7
|
-
Requires-Dist: kagglehub==0.3.13
|
8
|
-
Requires-Dist: numpy==2.2.6
|
9
|
-
Requires-Dist: pandas==2.3.3
|
10
|
-
Requires-Dist: scikit-learn==1.7.1
|
11
|
-
Description-Content-Type: text/markdown
|
12
|
-
|
13
|
-
- For class kaggle_data in datahub
|
14
|
-
- We need to set API of kaggle.
|
15
|
-
|
16
|
-
```python
|
17
|
-
import junshan_kit.datahub
|
18
|
-
data = junshan_kit.datahub.kaggle_data()
|
19
|
-
data.list_user_datasets()
|
20
|
-
data.read_data(data_name='letter-libsvm', copy_path='./exp_data/Letter')
|
21
|
-
```
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
@@ -1,7 +0,0 @@
|
|
1
|
-
junshan_kit/DataProcessor.py,sha256=9mlLYxdDiMX7baZmfJk5QuxT4vx_V728XIFbkXmCP0s,1594
|
2
|
-
junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
junshan_kit/datahub.py,sha256=BWcG_TPW1xf_y_GzxRXanuOAB01WugBiO5r53EDbr8s,1815
|
4
|
-
junshan_kit/test.py,sha256=aEaobINtr4Ri0jX6D8u49xgftyA6SE12wx0P6m5x-2w,90
|
5
|
-
junshan_kit-2.1.6.dist-info/METADATA,sha256=KZTS690qvlgOduiYwo6oqshsk4dqY8m9HcVtWu-aXTI,599
|
6
|
-
junshan_kit-2.1.6.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
|
7
|
-
junshan_kit-2.1.6.dist-info/RECORD,,
|
File without changes
|