bm-preprocessing 0.3.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/PKG-INFO +1 -1
  2. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/USAGE.md +17 -1
  3. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/pyproject.toml +1 -1
  4. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/__init__.py +4 -1
  5. bm_preprocessing-0.4.0/src/bm_preprocessing/DM/id3.py +30 -0
  6. bm_preprocessing-0.4.0/src/bm_preprocessing/DM/id3_test.py +30 -0
  7. bm_preprocessing-0.4.0/src/bm_preprocessing/DM/sources/id3.py +134 -0
  8. bm_preprocessing-0.4.0/src/bm_preprocessing/DM/sources/id3_test.py +148 -0
  9. bm_preprocessing-0.4.0/src/bm_preprocessing/DM/sources/tennis.csv +15 -0
  10. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/.gitignore +0 -0
  11. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/README.md +0 -0
  12. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/all.py +0 -0
  13. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/apriori.py +0 -0
  14. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/hash.py +0 -0
  15. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/hunts.py +0 -0
  16. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/hunts_test.py +0 -0
  17. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/preprocessing.py +0 -0
  18. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/sources/all.py +0 -0
  19. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/sources/apriori.py +0 -0
  20. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/sources/data.csv +0 -0
  21. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/sources/hash.py +0 -0
  22. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/sources/hunts.py +0 -0
  23. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/sources/hunts_test.py +0 -0
  24. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/sources/preprocessing.py +0 -0
  25. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/IR/__init__.py +0 -0
  26. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/IR/all.py +0 -0
  27. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/IR/sources/all.py +0 -0
  28. {bm_preprocessing-0.3.0 → bm_preprocessing-0.4.0}/src/bm_preprocessing/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bm-preprocessing
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: A package to preprocess text data
5
5
  Requires-Python: >=3.8
6
6
  Requires-Dist: build>=1.2.2.post1
@@ -15,7 +15,7 @@ Create a file `example.py`:
15
15
  ```python
16
16
  # Import modules
17
17
  from bm_preprocessing.IR import all
18
- from bm_preprocessing.DM import apriori, hash, hunts, hunts_test, preprocessing
18
+ from bm_preprocessing.DM import apriori, hash, hunts, hunts_test, id3, id3_test, preprocessing
19
19
 
20
20
  # Print the source code
21
21
  print("=== IR All Module ===")
@@ -33,6 +33,12 @@ print(hunts)
33
33
  print("\n=== DM Hunts Test Module ===")
34
34
  print(hunts_test)
35
35
 
36
+ print("\n=== DM ID3 Module ===")
37
+ print(id3)
38
+
39
+ print("\n=== DM ID3 Test Module ===")
40
+ print(id3_test)
41
+
36
42
  print("\n=== DM Preprocessing Module ===")
37
43
  print(preprocessing)
38
44
  ```
@@ -66,6 +72,12 @@ Then in the Python REPL:
66
72
  # Prints entire DM/hunts.py source code
67
73
  >>> print(hunts_test)
68
74
  # Prints entire DM/hunts_test.py source code
75
+
76
+ >>> from bm_preprocessing.DM import id3, id3_test
77
+ >>> print(id3)
78
+ # Prints entire DM/id3.py source code
79
+ >>> print(id3_test)
80
+ # Prints entire DM/id3_test.py source code
69
81
  ```
70
82
 
71
83
  ---
@@ -78,6 +90,8 @@ python -c "from bm_preprocessing.DM import apriori; print(apriori)"
78
90
  python -c "from bm_preprocessing.DM import hash; print(hash)"
79
91
  python -c "from bm_preprocessing.DM import hunts; print(hunts)"
80
92
  python -c "from bm_preprocessing.DM import hunts_test; print(hunts_test)"
93
+ python -c "from bm_preprocessing.DM import id3; print(id3)"
94
+ python -c "from bm_preprocessing.DM import id3_test; print(id3_test)"
81
95
  python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
82
96
  ```
83
97
 
@@ -93,4 +107,6 @@ python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
93
107
  | `from bm_preprocessing.DM import hash` | Hash-based mining |
94
108
  | `from bm_preprocessing.DM import hunts` | Hunt's decision tree algorithm |
95
109
  | `from bm_preprocessing.DM import hunts_test` | Hunt's decision tree with visualization |
110
+ | `from bm_preprocessing.DM import id3` | ID3 decision tree algorithm |
111
+ | `from bm_preprocessing.DM import id3_test` | ID3 decision tree with visualization |
96
112
  | `from bm_preprocessing.DM import preprocessing` | Data preprocessing utilities |
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "bm-preprocessing"
7
- version = "0.3.0"
7
+ version = "0.4.0"
8
8
  description = "A package to preprocess text data"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -5,6 +5,9 @@ from .apriori import apriori
5
5
  from .hash import hash
6
6
  from .hunts import hunts
7
7
  from .hunts_test import hunts_test
8
+ from .id3 import id3
9
+ from .id3_test import id3_test
8
10
  from .preprocessing import preprocessing
9
11
 
10
- __all__ = ["all", "apriori", "hash", "hunts", "hunts_test", "preprocessing"]
12
+ __all__ = ["all", "apriori", "hash", "hunts", "hunts_test", "id3", "id3_test", "preprocessing"]
13
+
@@ -0,0 +1,30 @@
1
"""Source code loader for DM/id3.py"""

from pathlib import Path


class SourceCodeModule:
    """Expose a source file's text; printing the object prints the code."""

    def __init__(self, name: str, source_path: Path):
        self.name = name
        self._source_path = source_path
        self._source_code = None  # filled in on first access

    @property
    def source_code(self) -> str:
        """Read the file on first access and cache the text."""
        cached = self._source_code
        if cached is None:
            cached = self._source_path.read_text(encoding="utf-8")
            self._source_code = cached
        return cached

    def __repr__(self) -> str:
        return self.source_code

    # str() and repr() intentionally show the same thing: the raw source.
    __str__ = __repr__


# Resolve the bundled implementation file relative to this module.
_source_file = Path(__file__).parent / "sources" / "id3.py"
id3 = SourceCodeModule("DM.id3", _source_file)
@@ -0,0 +1,30 @@
1
"""Source code loader for DM/id3_test.py"""

from pathlib import Path


class SourceCodeModule:
    """Wraps a source file; printing the instance prints the file's text."""

    def __init__(self, name: str, source_path: Path):
        self.name = name
        self._source_path = source_path
        self._source_code = None  # populated lazily by `source_code`

    @property
    def source_code(self) -> str:
        """Load the file lazily and memoise the text."""
        if self._source_code is None:
            self._source_code = self._source_path.read_text(encoding="utf-8")
        return self._source_code

    def __str__(self) -> str:
        return self.source_code

    def __repr__(self) -> str:
        return str(self)


# Path of the bundled implementation shown by this loader.
_source_file = Path(__file__).parent / "sources" / "id3_test.py"
id3_test = SourceCodeModule("DM.id3_test", _source_file)
@@ -0,0 +1,134 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+
5
+
6
# Load and process data: the demo dataset ships next to this module.
data_path = os.path.join(os.path.dirname(__file__), "data.csv")
df = pd.read_csv(data_path)

# Normalise "Annual Income" entries such as "125K" / "100 K" into integers.
income = (
    df["Annual Income"]
    .astype(str)
    .str.replace("K", "", regex=False)
    .str.replace(" ", "", regex=False)
    .astype(int)
)
df["Annual Income"] = income * 1000
17
+
18
+
19
# Entropy calculation
def entropy(df, target_column):
    """Shannon entropy (base 2) of the class distribution in `target_column`.

    `df` must be non-empty; an empty frame would divide by zero.
    """
    counts = df[target_column].value_counts()
    probs = counts / len(df)
    return -sum(probs * np.log2(probs))


# Information gain calculation
def information_gain(df, feature, target_column):
    """Entropy reduction achieved by partitioning `df` on `feature`."""
    total_entropy = entropy(df, target_column)

    weighted_entropy = 0
    for value in df[feature].unique():
        subset = df[df[feature] == value]
        weighted_entropy += (len(subset) / len(df)) * entropy(subset, target_column)

    return total_entropy - weighted_entropy


# Best feature selection
def best_feature(df, feature_columns, target_column):
    """Return the feature from `feature_columns` with the highest gain."""
    gains = {
        feature: information_gain(df, feature, target_column)
        for feature in feature_columns
    }
    return max(gains, key=gains.get)


# Node class
class ID3Node:
    """One tree node: an internal split (feature/value) or a leaf (label)."""

    def __init__(self, feature=None, value=None, label=None):
        self.feature = feature  # feature tested at this node (internal nodes)
        self.value = value  # human-readable description of a numeric split
        self.children = {}  # branch key -> child ID3Node
        self.label = label  # predicted class (leaf nodes only)

    def is_leaf(self):
        return self.label is not None


# ID3 algorithm
def id3(df, target_column, feature_columns):
    """Recursively build an ID3 decision tree over a non-empty `df`.

    Categorical features branch on every distinct value; numeric features
    are split once at their median.  Returns the root ID3Node.
    """
    # If the target column is pure, return a leaf node
    if len(df[target_column].unique()) == 1:
        return ID3Node(label=df[target_column].mode()[0])

    # If no features left, return leaf with majority class
    if not feature_columns:
        return ID3Node(label=df[target_column].mode()[0])

    feature = best_feature(df, feature_columns, target_column)
    node = ID3Node(feature=feature)
    remaining_features = [col for col in feature_columns if col != feature]

    if pd.api.types.is_numeric_dtype(df[feature]):
        median_value = df[feature].median()
        node.value = f"{feature} <= {median_value}"

        # Fix: a median split can leave one side empty (e.g. all values
        # equal).  Recursing into an empty frame crashed (`mode()[0]` on an
        # empty Series); attach a majority-class leaf for that side instead.
        majority = df[target_column].mode()[0]
        splits = {
            "<= " + str(median_value): df[df[feature] <= median_value],
            "> " + str(median_value): df[df[feature] > median_value],
        }
        for branch, part in splits.items():
            if len(part):
                node.children[branch] = id3(part, target_column, remaining_features)
            else:
                node.children[branch] = ID3Node(label=majority)
    else:
        # Each unique value yields a non-empty subset by construction.
        for val in df[feature].unique():
            subset = df[df[feature] == val]
            node.children[val] = id3(subset, target_column, remaining_features)

    return node
96
+
97
+
98
# Print tree function
def print_id3_tree(node, indent=""):
    """Pretty-print an ID3 tree to stdout, indenting each level by 4 spaces."""
    if node.is_leaf():
        print(f"{indent}Leaf: {node.label}")
        return

    header = (
        f"{indent}[Numeric Split] {node.value}"
        if node.value
        else f"{indent}[Categorical Split] {node.feature}"
    )
    print(header)

    child_indent = indent + "    "
    for branch, child in node.children.items():
        print(f"{indent}--> {branch}:")
        print_id3_tree(child, child_indent)
112
+
113
+
114
def main():
    """Build and print ID3 trees for both bundled demo datasets."""
    # data.csv: every column except the target and the row id is a feature.
    excluded = {"Default id", "Tid"}
    feature_columns = [c for c in df.columns if c not in excluded]
    tree_root = id3(df, target_column="Default id", feature_columns=feature_columns)

    print("=== ID3 Algorithm - Decision Tree (data.csv) ===\n")
    print_id3_tree(tree_root)

    # tennis.csv: the classic play-tennis example, target column "Play".
    tennis_path = os.path.join(os.path.dirname(__file__), "tennis.csv")
    tennis_df = pd.read_csv(tennis_path)
    tennis_tree = id3(
        tennis_df,
        target_column="Play",
        feature_columns=[c for c in tennis_df.columns if c != "Play"],
    )

    print("\n\n=== ID3 Algorithm - Decision Tree (tennis.csv) ===\n")
    print_id3_tree(tennis_tree)


if __name__ == "__main__":
    main()
@@ -0,0 +1,148 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import graphviz
4
+ import os
5
+
6
+
7
# Load and process data: the demo dataset ships next to this module.
data_path = os.path.join(os.path.dirname(__file__), "data.csv")
df = pd.read_csv(data_path)

# Normalise "Annual Income" entries such as "125K" / "100 K" into integers.
income = (
    df["Annual Income"]
    .astype(str)
    .str.replace("K", "", regex=False)
    .str.replace(" ", "", regex=False)
    .astype(int)
)
df["Annual Income"] = income * 1000
18
+
19
+
20
# Entropy calculation
def entropy(df, target_column):
    """Shannon entropy (base 2) of the class distribution in `target_column`.

    `df` must be non-empty; an empty frame would divide by zero.
    """
    counts = df[target_column].value_counts()
    probs = counts / len(df)
    return -sum(probs * np.log2(probs))


# Information gain calculation
def information_gain(df, feature, target_column):
    """Entropy reduction achieved by partitioning `df` on `feature`."""
    total_entropy = entropy(df, target_column)

    weighted_entropy = 0
    for value in df[feature].unique():
        subset = df[df[feature] == value]
        weighted_entropy += (len(subset) / len(df)) * entropy(subset, target_column)

    return total_entropy - weighted_entropy


# Best feature selection
def best_feature(df, feature_columns, target_column):
    """Return the feature from `feature_columns` with the highest gain."""
    gains = {
        feature: information_gain(df, feature, target_column)
        for feature in feature_columns
    }
    return max(gains, key=gains.get)


# Node class
class ID3Node:
    """One tree node: an internal split (feature/value) or a leaf (label)."""

    def __init__(self, feature=None, value=None, label=None):
        self.feature = feature  # feature tested at this node (internal nodes)
        self.value = value  # human-readable description of a numeric split
        self.children = {}  # branch key -> child ID3Node
        self.label = label  # predicted class (leaf nodes only)

    def is_leaf(self):
        return self.label is not None


# ID3 algorithm
def id3(df, target_column, feature_columns):
    """Recursively build an ID3 decision tree over a non-empty `df`.

    Categorical features branch on every distinct value; numeric features
    are split once at their median.  Returns the root ID3Node.
    """
    # If the target column is pure, return a leaf node
    if len(df[target_column].unique()) == 1:
        return ID3Node(label=df[target_column].mode()[0])

    # If no features left, return leaf with majority class
    if not feature_columns:
        return ID3Node(label=df[target_column].mode()[0])

    feature = best_feature(df, feature_columns, target_column)
    node = ID3Node(feature=feature)
    remaining_features = [col for col in feature_columns if col != feature]

    if pd.api.types.is_numeric_dtype(df[feature]):
        median_value = df[feature].median()
        node.value = f"{feature} <= {median_value}"

        # Fix: a median split can leave one side empty (e.g. all values
        # equal).  Recursing into an empty frame crashed (`mode()[0]` on an
        # empty Series); attach a majority-class leaf for that side instead.
        majority = df[target_column].mode()[0]
        splits = {
            "<= " + str(median_value): df[df[feature] <= median_value],
            "> " + str(median_value): df[df[feature] > median_value],
        }
        for branch, part in splits.items():
            if len(part):
                node.children[branch] = id3(part, target_column, remaining_features)
            else:
                node.children[branch] = ID3Node(label=majority)
    else:
        # Each unique value yields a non-empty subset by construction.
        for val in df[feature].unique():
            subset = df[df[feature] == val]
            node.children[val] = id3(subset, target_column, remaining_features)

    return node
97
+
98
+
99
# Tree visualization function using graphviz
def visualize_id3_tree(node, parent_name="Root", graph=None):
    """Add `node` and its descendants to a graphviz Digraph; return the graph.

    A new Digraph is created only when `graph` is None, so recursive calls
    (and tests) can supply their own graph object.
    """
    if graph is None:
        graph = graphviz.Digraph(format="png", engine="dot")

    if node.is_leaf():
        graph.node(parent_name, label=str(node.label), shape="ellipse")
    else:
        graph.node(
            parent_name,
            label=node.value if node.value else str(node.feature),
            shape="box",
        )

    for branch, child in node.children.items():
        child_id = f"{parent_name}_{branch}"
        graph.edge(parent_name, child_id, label=str(branch))
        visualize_id3_tree(child, child_id, graph)

    return graph
119
+
120
+
121
def main():
    """Build ID3 trees for both demo datasets and render them as PNGs."""
    # data.csv: every column except the target and the row id is a feature.
    excluded = {"Default id", "Tid"}
    feature_columns = [c for c in df.columns if c not in excluded]
    tree_root = id3(df, target_column="Default id", feature_columns=feature_columns)

    # Render the tree next to this module; `view=True` opens the image.
    graph = visualize_id3_tree(tree_root)
    output_path = os.path.join(os.path.dirname(__file__), "id3_decision_tree")
    graph.render(output_path, view=True, cleanup=True)
    print(f"Decision tree rendered and saved as '{output_path}.png'")

    # tennis.csv: the classic play-tennis example, target column "Play".
    tennis_path = os.path.join(os.path.dirname(__file__), "tennis.csv")
    tennis_df = pd.read_csv(tennis_path)
    tennis_tree = id3(
        tennis_df,
        target_column="Play",
        feature_columns=[c for c in tennis_df.columns if c != "Play"],
    )

    graph_tennis = visualize_id3_tree(tennis_tree)
    tennis_output_path = os.path.join(
        os.path.dirname(__file__), "id3_tennis_decision_tree"
    )
    graph_tennis.render(tennis_output_path, view=True, cleanup=True)
    print(f"Tennis decision tree rendered and saved as '{tennis_output_path}.png'")


if __name__ == "__main__":
    main()
@@ -0,0 +1,15 @@
1
+ Outlook,Temperature,Humidity,Wind,Play
2
+ Sunny,Hot,High,Weak,No
3
+ Sunny,Hot,High,Strong,No
4
+ Overcast,Hot,High,Weak,Yes
5
+ Rain,Mild,High,Weak,Yes
6
+ Rain,Cool,Normal,Weak,Yes
7
+ Rain,Cool,Normal,Strong,No
8
+ Overcast,Cool,Normal,Strong,Yes
9
+ Sunny,Mild,High,Weak,No
10
+ Sunny,Cool,Normal,Weak,Yes
11
+ Rain,Mild,Normal,Weak,Yes
12
+ Sunny,Mild,Normal,Strong,Yes
13
+ Overcast,Mild,High,Strong,Yes
14
+ Overcast,Hot,Normal,Weak,Yes
15
+ Rain,Mild,High,Strong,No