junshan-kit 2.1.6__py2.py3-none-any.whl → 2.1.8__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
 
2
2
  import pandas as pd
3
3
  import os
4
-
4
+ from sklearn.preprocessing import StandardScaler
5
5
  import junshan_kit.datahub
6
6
 
7
7
  class CSVToPandas:
@@ -16,6 +16,7 @@ class CSVToPandas:
16
16
 
17
17
  # ----------------- ccfd_kaggle ----------------------------------
18
18
  def ccfd_kaggle(self, data_name = 'ccfd-kaggle', show_info = True):
19
+ # download the data if it does not already exist
19
20
  self.read_csv(data_name)
20
21
 
21
22
  df = pd.read_csv(self.csv_path)
@@ -33,13 +34,72 @@ class CSVToPandas:
33
34
  print('='*60)
34
35
  print(f"{'Original size:':<25} {m_before} rows x {n_before} cols")
35
36
  print(f"{'Size after dropping NaNs:':<25} {m_after} rows x {n_after} cols")
36
- print(f"{'Export size:':<25} {m_after} rows x {n_after} cols")
37
37
  print(f"{'Positive samples (+1):':<25} {pos_count}")
38
38
  print(f"{'Negative samples (-1):':<25} {neg_count}")
39
+ print(f"{'Export size:':<25} {m_after} rows x {n_after} cols")
39
40
  print('-'*60)
40
41
  print(f"More details: https://www.jianguoyun.com/p/Dd1clVgQ4ZThCxiwzZQGIAA")
41
42
  print('='*60 + '\n')
42
43
 
43
44
  return df
45
+
46
+ # ------------------------
47
+ def ghpdd_kaggle(self, data_name='ghpdd-kaggle', show_info=True):
48
+ # download the data if it does not already exist
49
+ self.read_csv(data_name)
50
+
51
+ # read csv
52
+ df = pd.read_csv(self.csv_path)
53
+ m_before, n_before = df.shape
54
+
55
+ # drop NaNs
56
+ df = df.dropna(axis=0, how='any')
57
+ m_after, n_after = df.shape
58
+
59
+ # drop unique identifier
60
+ if 'property_id' in df.columns:
61
+ df.drop(columns=['property_id'], inplace=True)
62
+
63
+ # Replace label 0 with -1
64
+ df['decision'] = df['decision'].replace(0, -1)
65
+
66
+ # Identify categorical and numerical columns
67
+ cat_cols = ['country', 'city', 'property_type', 'furnishing_status']
68
+ num_cols = [col for col in df.columns if col not in cat_cols + ['decision']]
69
+
70
+ # One-Hot encode categorical columns
71
+ df = pd.get_dummies(df, columns=cat_cols)
72
+
73
+ # Convert boolean columns to int
74
+ bool_cols = df.select_dtypes(include='bool').columns
75
+ for col in bool_cols:
76
+ df[col] = df[col].astype(int)
77
+
78
+ # Standardize numerical columns
79
+ scaler = StandardScaler()
80
+ df[num_cols] = scaler.fit_transform(df[num_cols])
81
+
82
+ # Size after export (post encoding and scaling)
83
+ m_export, n_export = df.shape
84
+
85
+ if show_info:
86
+ pos_count = (df['decision'] == 1).sum()
87
+ neg_count = (df['decision'] == -1).sum()
88
+
89
+ print('\n' + '='*70)
90
+ print(f"{'GHPDD-Kaggle Dataset Info':^70}")
91
+ print('='*70)
92
+ print(f"{'Original size:':<35} {m_before} rows x {n_before} cols")
93
+ print(f"{'Size after dropping NaNs:':<35} {m_after} rows x {n_after} cols")
94
+ print(f"{'Positive samples (+1):':<35} {pos_count}")
95
+ print(f"{'Negative samples (-1):':<35} {neg_count}")
96
+ print(f"{'Export size (after encoding & scaling):':<35} {m_export} rows x {n_export} cols")
97
+ print('-'*70)
98
+ print(f"{'Note: categorical columns one-hot encoded, numerical standardized.'}")
99
+ print(f"More details: https://www.jianguoyun.com/p/DU6Lr9oQqdHDDRj5sI0GIAA")
100
+ print('='*70 + '\n')
101
+
102
+ return df
103
+
44
104
 
45
105
 
junshan_kit/datahub.py CHANGED
@@ -1,8 +1,13 @@
1
1
  import kagglehub
2
- import os
2
+ import os, time
3
3
  import warnings
4
4
  import shutil
5
5
  from kaggle.api.kaggle_api_extended import KaggleApi
6
+ from selenium import webdriver
7
+ from selenium.webdriver.common.by import By
8
+ from selenium.webdriver.chrome.options import Options
9
+ from selenium.webdriver.support.ui import WebDriverWait
10
+ from selenium.webdriver.support import expected_conditions as EC
6
11
 
7
12
  class kaggle_data:
8
13
  def list_datasets(self):
@@ -42,20 +47,91 @@ class kaggle_data:
42
47
  # example: read_data(copy_path='./exp_data')
43
48
 
44
49
 
50
+ class JianguoDownloader:
51
+ def __init__(self, url: str, download_path: str = "./downloads"):
52
+ self.url = url
53
+ self.download_path = os.path.abspath(download_path)
54
+ os.makedirs(self.download_path, exist_ok=True)
45
55
 
56
+ # Configure Chrome options
57
+ self.chrome_options = Options()
58
+ prefs = {
59
+ "download.default_directory": self.download_path,
60
+ "download.prompt_for_download": False,
61
+ "download.directory_upgrade": True,
62
+ "safebrowsing.enabled": True,
63
+ "profile.default_content_setting_values.automatic_downloads": 1,
64
+ }
65
+ self.chrome_options.add_experimental_option("prefs", prefs)
46
66
 
67
+ # Optional stability flags
68
+ self.chrome_options.add_argument("--disable-gpu")
69
+ self.chrome_options.add_argument("--no-sandbox")
70
+ self.chrome_options.add_argument("--disable-dev-shm-usage")
71
+ self.chrome_options.add_argument("--enable-features=NetworkService,NetworkServiceInProcess")
47
72
 
48
- if __name__ == "__main__":
49
- # Your code here
50
- data = kaggle_data()
51
- # Example usage
52
- data.list_user_datasets()
53
- data.download_data(data_name='letter-libsvm', copy_path='./exp_data/Letter')
73
+ # Start Chrome
74
+ self.driver = webdriver.Chrome(options=self.chrome_options)
54
75
 
55
- """
56
- import junshan_kit.datahub
57
- data = junshan_kit.datahub.kaggle_data()
58
- data.list_user_datasets()
59
- data.read_data(data_name='letter-libsvm', copy_path='./exp_data/Letter')
60
- """
76
+ def open_page(self):
77
+ """Open the Jianguoyun share page."""
78
+ print(f"🌐 Opening link: {self.url}")
79
+ self.driver.get(self.url)
80
+
81
+ def click_download_button(self):
82
+ """Find and click the download button."""
83
+ print("🔍 Looking for the download button...")
84
+ wait = WebDriverWait(self.driver, 30)
85
+ span = wait.until(
86
+ EC.presence_of_element_located((By.XPATH, "//span[contains(text(),'下载')]"))
87
+ )
88
+ parent = span.find_element(By.XPATH, "./..")
89
+ self.driver.execute_script("arguments[0].click();", parent)
90
+ print(f"✅ Download button clicked. Files will be saved to: {self.download_path}")
91
+
92
+ # If Jianguoyun opens a new tab, switch to it
93
+ time.sleep(3)
94
+ if len(self.driver.window_handles) > 1:
95
+ self.driver.switch_to.window(self.driver.window_handles[-1])
96
+ print("📂 Switched to download tab.")
97
+
98
+ def wait_for_downloads(self, timeout=300):
99
+ """Wait until all downloads are finished."""
100
+ print("⏳ Waiting for downloads to finish...")
101
+ start_time = time.time()
102
+ while True:
103
+ downloading = [f for f in os.listdir(self.download_path) if f.endswith(".crdownload")]
104
+ if not downloading:
105
+ print("✅ Download completed!")
106
+ return True
107
+ if time.time() - start_time > timeout:
108
+ print("⏰ Timeout: downloads may not have finished.")
109
+ return False
110
+ time.sleep(2)
111
+
112
+ def get_latest_file(self):
113
+ """Return the most recently downloaded file (if any)."""
114
+ files = [os.path.join(self.download_path, f) for f in os.listdir(self.download_path)]
115
+ return max(files, key=os.path.getctime) if files else None
116
+
117
+ def close(self):
118
+ """Close the browser."""
119
+ self.driver.quit()
120
+ print("🚪 Browser closed.")
121
+
122
+ def run(self):
123
+ """Run the complete download process."""
124
+ print('*'*50)
125
+ try:
126
+ self.open_page()
127
+ self.click_download_button()
128
+ self.wait_for_downloads()
129
+ latest = self.get_latest_file()
130
+ if latest:
131
+ print(f"📄 Latest downloaded file: {latest}")
132
+ except Exception as e:
133
+ print("❌ Error occurred:", e)
134
+ finally:
135
+ self.close()
136
+ print('*'*50)
61
137
 
junshan_kit/test.py CHANGED
@@ -1,5 +1,3 @@
1
- import DataProcessor
1
+ from datahub import JianguoDownloader
2
2
 
3
-
4
- data_loader = DataProcessor.CSVToPandas()
5
- data_loader.ccfd_kaggle()
3
+ data2 = JianguoDownloader('www.lka.com', './expspe')
@@ -0,0 +1,11 @@
1
+ Metadata-Version: 2.4
2
+ Name: junshan_kit
3
+ Version: 2.1.8
4
+ Summary: This is an optimization tool.
5
+ Author-email: Junshan Yin <junshanyin@163.com>
6
+ Requires-Dist: kaggle==1.7.4.5
7
+ Requires-Dist: kagglehub==0.3.13
8
+ Requires-Dist: numpy==2.2.6
9
+ Requires-Dist: pandas==2.3.3
10
+ Requires-Dist: scikit-learn==1.7.1
11
+ Requires-Dist: selenium==4.36.0
@@ -0,0 +1,7 @@
1
+ junshan_kit/DataProcessor.py,sha256=AW_1jROexC3s41-RgzqzYVwPI0sOf3tzjiph4qa_Vcw,3882
2
+ junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ junshan_kit/datahub.py,sha256=mofbkp8ry6_LM_vW1LcZolp5tfkqOp_cUiwjfDFbRqI,5153
4
+ junshan_kit/test.py,sha256=uSckjcr_Wgj__YPTwD6x0GY8Hfn5GBEXIpRf9vIYBbU,91
5
+ junshan_kit-2.1.8.dist-info/METADATA,sha256=eFQmrVEUORZRhZqBCOlctfSU3vwCQ2RB4Jpyj1coAmE,329
6
+ junshan_kit-2.1.8.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
7
+ junshan_kit-2.1.8.dist-info/RECORD,,
@@ -1,31 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: junshan_kit
3
- Version: 2.1.6
4
- Summary: This is an optimization tool.
5
- Author-email: Junshan Yin <junshanyin@163.com>
6
- Requires-Dist: kaggle==1.7.4.5
7
- Requires-Dist: kagglehub==0.3.13
8
- Requires-Dist: numpy==2.2.6
9
- Requires-Dist: pandas==2.3.3
10
- Requires-Dist: scikit-learn==1.7.1
11
- Description-Content-Type: text/markdown
12
-
13
- - For class kaggle_data in datahub
14
- - We need to set API of kaggle.
15
-
16
- ```python
17
- import junshan_kit.datahub
18
- data = junshan_kit.datahub.kaggle_data()
19
- data.list_user_datasets()
20
- data.read_data(data_name='letter-libsvm', copy_path='./exp_data/Letter')
21
- ```
22
-
23
-
24
-
25
-
26
-
27
-
28
-
29
-
30
-
31
-
@@ -1,7 +0,0 @@
1
- junshan_kit/DataProcessor.py,sha256=9mlLYxdDiMX7baZmfJk5QuxT4vx_V728XIFbkXmCP0s,1594
2
- junshan_kit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- junshan_kit/datahub.py,sha256=BWcG_TPW1xf_y_GzxRXanuOAB01WugBiO5r53EDbr8s,1815
4
- junshan_kit/test.py,sha256=aEaobINtr4Ri0jX6D8u49xgftyA6SE12wx0P6m5x-2w,90
5
- junshan_kit-2.1.6.dist-info/METADATA,sha256=KZTS690qvlgOduiYwo6oqshsk4dqY8m9HcVtWu-aXTI,599
6
- junshan_kit-2.1.6.dist-info/WHEEL,sha256=tkmg4JIqwd9H8mL30xA7crRmoStyCtGp0VWshokd1Jc,105
7
- junshan_kit-2.1.6.dist-info/RECORD,,