hadup 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hadup-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.4
2
+ Name: hadup
3
+ Version: 0.1.0
4
+ Summary: hadup Library made by Mannny
5
+ Author-email: Manny <Mannny@mannny.com>
6
+ License: MIT
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+
10
+ <!-- hadup Library made by Mannny -->
hadup-0.1.0/README.md ADDED
@@ -0,0 +1 @@
1
+ <!-- hadup Library made by Mannny -->
@@ -0,0 +1,2 @@
1
+ """hadup Library made by Mannny"""
2
+ __version__ = "0.1.0"
@@ -0,0 +1,85 @@
1
# Lab handout 1: single-node Hadoop cluster in pseudo-distributed mode.
# The text below is emitted verbatim by main() (exposed as the `p1`
# console script in pyproject.toml).
P1_CODE = '''
# 1 Create a Single Node Cluster in Pseudo-Distributed Mode
1. Introduction
Pseudo-Distributed Mode is a Hadoop setup where all Hadoop daemons run on a single machine but in separate JVM processes. It simulates a real distributed cluster while using only one system.
It is mainly used for learning, testing, and development purposes.

2. Key Features
Each daemon (NameNode, DataNode, ResourceManager, NodeManager) runs separately.
Data is stored in HDFS but physically resides on the local disk.
Communication happens through localhost (127.0.0.1).

3. Hadoop Components Configured
HDFS
NameNode: Manages metadata and directory structure.
DataNode: Stores actual data blocks.
YARN
ResourceManager: Manages cluster resources.
NodeManager: Monitors resources and runs tasks.

4. Prerequisites
Ubuntu OS
Java 8 installed
SSH configured (passwordless login)
Hadoop extracted in /home/<user>/hadoop

5. Configuration Steps
1. Verify Java
java -version
2. Install Java (if needed)
sudo apt install openjdk-8-jdk -y
3. Set JAVA_HOME

Edit hadoop-env.sh and add:
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64

4. Configure core-site.xml

5. Configure hdfs-site.xml

6. Configure mapred-site.xml

7. Configure yarn-site.xml

6. Format HDFS
~/hadoop/bin/hdfs namenode -format

7. Start Hadoop Services

Start HDFS:
~/hadoop/sbin/start-dfs.sh
Start YARN:
~/hadoop/sbin/start-yarn.sh

8. Verify Services
jps

Expected services:
NameNode
DataNode
SecondaryNameNode
ResourceManager
NodeManager

Web Interfaces:
HDFS UI: http://localhost:9870
YARN UI: http://localhost:8088

9. Benefits

No extra hardware required
Ideal for learning and testing
Same configuration as production clusters
Fast debugging and experimentation

10. Conclusion
Hadoop Pseudo-Distributed Mode allows users to simulate a real Hadoop cluster on a single machine. It is widely used in educational labs and development environments to test Big Data applications without requiring large infrastructure.

'''

def main():
    """Print the practical-1 handout text to stdout."""
    # print("")
    print(P1_CODE)

if __name__ == "__main__":
    main()
@@ -0,0 +1,10 @@
1
# Lab handout 10: placeholder module — the content block is still empty.
# Exposed as the `p10` console script in pyproject.toml.
P10_CONTENT = '''

'''


def main():
    """Emit the (currently empty) practical-10 content to stdout."""
    print(P10_CONTENT)


if __name__ == "__main__":
    main()
@@ -0,0 +1,79 @@
1
# Lab handout 2: introduction to Hadoop and basic HDFS shell commands.
# The text below is emitted verbatim by main() (the `p2` console script).
P2_CODE = '''
# 2 Introduction to Hadoop

Hadoop is a Big Data framework used to store and process large amounts of data.
It works on distributed systems using multiple computers.
Hadoop is scalable and fault tolerant.

Hadoop has two main components:
HDFS – used for storing large data.
MapReduce – used for processing data in parallel.

Starting Hadoop Services
Start the Hadoop daemons using:

hdfs --daemon start namenode
hdfs --daemon start datanode
yarn --daemon start resourcemanager
yarn --daemon start nodemanager

NameNode manages metadata.
DataNode stores data blocks.
ResourceManager manages cluster resources.
NodeManager runs tasks.

Verify Services
jps

NameNode, DataNode, ResourceManager, and NodeManager should appear.

Web Interface
HDFS UI: http://localhost:9870
YARN UI: http://localhost:8088


HDFS Commands

Check version:
hadoop version

List root directory:
hdfs dfs -ls /

Create directory:
hdfs dfs -mkdir /data

Create local file:
echo "Hello Hadoop" > local.txt

Upload file:
hdfs dfs -put local.txt /data/

List files:
hdfs dfs -ls /data

Download file:
hdfs dfs -get /data/local.txt

Show file content:
hdfs dfs -cat /data/local.txt

Copy file:
hdfs dfs -cp /data/local.txt /data/local_copy.txt

Rename file:
hdfs dfs -mv /data/local_copy.txt /data/local_renamed.txt

Delete file:
hdfs dfs -rm /data/local_renamed.txt

Delete folder:
hdfs dfs -rm -r /data
'''

def main():
    """Print the practical-2 handout text to stdout."""
    # print("")
    print(P2_CODE)

if __name__ == "__main__":
    main()
@@ -0,0 +1,34 @@
1
# Lab handout 3: WordCount example run with two reducers.
# The text below is emitted verbatim by main() (the `p3` console script).
P3_CODE = '''
# 3 WordCount 2 reducers

Create Input File
echo "hello hadoop hello mapreduce" > wc_input.txt

Create HDFS Directory
hdfs dfs -mkdir -p /wordcount_input

Upload File
hdfs dfs -put wc_input.txt /wordcount_input/
hdfs dfs -ls /

Run WordCount (2 Reducers)
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \
wordcount \
-D mapreduce.job.reduces=2 \
/wordcount_input \
/wordcount/output_2reducer

Check Output Files
hdfs dfs -ls /wordcount/output_2reducer

View Output
hdfs dfs -cat /wordcount/output_2reducer/part-r-*
'''

def main():
    """Print the practical-3 handout text to stdout."""
    # print("")
    print(P3_CODE)


if __name__ == "__main__":
    main()
@@ -0,0 +1,35 @@
1
# Lab handout 4: mapper-only WordCount (zero reducers; output is part-m-*).
# The text below is emitted verbatim by main() (the `p4` console script).
P4_CODE = '''
# 4 Mapper-Only WordCount

Create Input File
echo "hello hadoop hello mapreduce" > wc_input.txt
echo "mapper only job example in hadoop" >> wc_input.txt
cat wc_input.txt

Create HDFS Directory
hdfs dfs -mkdir -p /wordcount1/input

Upload File
hdfs dfs -put -f wc_input.txt /wordcount1/input/
hdfs dfs -ls /wordcount1/input

Run Mapper-Only Job
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \
wordcount \
-D mapreduce.job.reduces=0 \
/wordcount1/input \
/wordcount1/output_maponly

Check Output
hdfs dfs -ls /wordcount1/output_maponly

View Output
hdfs dfs -cat /wordcount1/output_maponly/part-m-*
'''

def main():
    """Print the practical-4 handout text to stdout."""
    # print("")
    print(P4_CODE)

if __name__ == "__main__":
    main()
@@ -0,0 +1,10 @@
1
# Lab handout 5: placeholder module — only the numbered heading so far.
# Exposed as the `p5` console script in pyproject.toml.
P5_CONTENT = '''
# 5
'''


def main():
    """Emit the practical-5 content to stdout."""
    print(P5_CONTENT)


if __name__ == "__main__":
    main()
@@ -0,0 +1,79 @@
1
# Lab handout 6: max temperature per year via Hadoop Streaming.
# Fixes relative to the original:
#  * dropped the run of invisible U+2800 (braille blank) characters that
#    trailed the opening triple quote — they printed as garbage;
#  * the literal is now a RAW string: previously the "\t" sequences inside
#    the embedded mapper/reducer snippets were interpreted as real TAB
#    characters, so the printed code no longer matched what students must
#    type. r''' preserves the backslash escapes verbatim;
#  * restored the indentation of the embedded Python snippets so they are
#    valid Python when copy-pasted (it had been flattened).
P6_CONTENT = r'''
# 6 Calculate Highest Temperature for Each Year using Hadoop MapReduce

Create Sample Temperature File
cat > temps.csv <<EOF
2000-01-01,25
2000-05-10,30
2000-12-31,28
2001-03-15,35
2001-07-20,40
2001-11-05,38
EOF

Upload File to HDFS
hdfs dfs -mkdir -p /maxtemp/input
hdfs dfs -put -f temps.csv /maxtemp/input/
hdfs dfs -ls /maxtemp/input

Create Mapper (max_temp_mapper.py)
cat > max_temp_mapper.py <<'EOF'
import sys
for line in sys.stdin:
    line=line.strip()
    if not line: continue
    parts=line.split(",")
    if len(parts)!=2: continue
    year=parts[0][:4]
    try:
        temp=float(parts[1])
    except:
        continue
    print(f"{year}\t{temp}")
EOF
chmod +x max_temp_mapper.py

Create Reducer (max_temp_reducer.py)
cat > max_temp_reducer.py <<'EOF'
import sys
current_year=None
current_max=None
for line in sys.stdin:
    line=line.strip()
    if not line: continue
    year,temp=line.split("\t")
    temp=float(temp)
    if current_year==year:
        if temp>current_max:
            current_max=temp
    else:
        if current_year:
            print(f"{current_year}\t{current_max}")
        current_year=year
        current_max=temp
if current_year:
    print(f"{current_year}\t{current_max}")
EOF
chmod +x max_temp_reducer.py

Run Hadoop Streaming Job
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
-input /maxtemp/input \
-output /maxtemp/output_max_temp \
-mapper "python3 max_temp_mapper.py" \
-reducer "python3 max_temp_reducer.py" \
-file max_temp_mapper.py \
-file max_temp_reducer.py

Check Output
hdfs dfs -ls /maxtemp/output_max_temp
hdfs dfs -cat /maxtemp/output_max_temp/part-*
'''

def main():
    """Print the practical-6 handout text to stdout."""
    print(P6_CONTENT)

if __name__ == "__main__":
    main()
@@ -0,0 +1,80 @@
1
# Lab handout 7: Apache Sqoop basic commands (RetailFlow case study).
# The text below is emitted verbatim by main() (the `p7` console script).
P7_CONTENT = '''
# 7 Study of Apache Sqoop Basic Commands
1. Executive Summary
Organization: RetailFlow Inc.
Objective: To migrate transactional data from MySQL to Hadoop HDFS for advanced analytics.
Tools Used: Apache Sqoop 1.4.x, Hadoop 3.x, MySQL 8.0

RetailFlow Inc. used Apache Sqoop to transfer data between MySQL and Hadoop in order to reduce database load and improve analytical performance.

2. Problem Statement
The company handled over 10 million transactions daily. Complex SQL queries caused latency in the MySQL production database.
The objectives were:
Move historical data to HDFS.
Maintain data consistency.
Automate data transfer between MySQL and Hadoop.

3. Implementation
Phase 1: Connectivity Check
List Databases
sqoop list-databases \
--connect jdbc:mysql://db-server.retailflow.com \
--username analytics_user \
--password-file /user/admin/.password

List Tables
sqoop list-tables \
--connect jdbc:mysql://db-server.retailflow.com/retail_db \
--username analytics_user \
--password-file /user/admin/.password

Phase 2: Full Data Import
sqoop import \
--connect jdbc:mysql://db-server.retailflow.com/retail_db \
--table transactions \
--target-dir /data/raw/transactions \
--m 4 \
--fields-terminated-by ','

Imported entire table into HDFS using 4 parallel mappers.

Phase 3: Selective Import
sqoop import \
--connect jdbc:mysql://db-server.retailflow.com/retail_db \
--table transactions \
--where "status='COMPLETED' AND trans_date > '2023-01-01'" \
--target-dir /data/analytics/filtered_transactions \
--m 1

Imported only required records using filter conditions.

Phase 4: Export Processed Data
sqoop export \
--connect jdbc:mysql://db-server.retailflow.com/retail_db \
--table loyalty_scores \
--export-dir /data/output/processed_loyalty \
--input-fields-terminated-by ',' \
--update-mode allowinsert

Exported processed data back to MySQL.

4. Challenges & Solutions
Used --password-file for security.
Adjusted --m to avoid database overload.
Used --as-parquetfile for better performance and storage efficiency.

5. Results
65% reduction in database load.
40% storage savings using Parquet.
Improved data accessibility for analytics.

6. Conclusion
Apache Sqoop successfully enabled secure and efficient data transfer between MySQL and Hadoop. It reduced system load, improved performance, and created a scalable data pipeline for analytics.
'''

def main():
    """Print the practical-7 handout text to stdout."""
    # print("")
    print(P7_CONTENT)

if __name__ == "__main__":
    main()
@@ -0,0 +1,52 @@
1
# Lab handout 8: Apache Flume configuration for Twitter data ingestion.
# The text below is emitted verbatim by main() (the `p8` console script).
P8_CONTENT = '''
# 8 Execute a Flume Configuration File to Extract Data from Twitter
1. Executive Summary

This case study demonstrates how Apache Flume was used to collect live Twitter data and store it in Hadoop HDFS for sentiment analysis and market trend monitoring. The system ensured reliable, real-time data ingestion.

2. Problem Statement
The client wanted to analyze brand perception in real time. Key challenges included:
Handling high tweet volume (velocity)
Managing unstructured JSON data
Preventing data loss during failures

3. Technology Stack
Data Source: Twitter Streaming API
Tool: Apache Flume 1.9.0
Storage: Hadoop HDFS (3.x)
Processing: JSON Interceptors

4. Flume Configuration Overview
The solution used a Flume agent with:
Source: TwitterSource (connects to Twitter API using keys and tokens)
Channel: Memory Channel (buffers tweets temporarily)
Sink: HDFS Sink (stores tweets in HDFS)
Tweets containing keywords like BigData, AI, Hadoop, MachineLearning were captured and stored in HDFS using date-based partitioning:
/user/flume/twitter_data/%Y/%m/%d/

5. Execution Steps
Ensure Hadoop HDFS is running.
Place required Twitter libraries in Flume’s lib folder.
Start the Flume agent using:
bin/flume-ng agent --conf ./conf/ -f conf/twitter_to_hdfs.conf \
-Dflume.root.logger=DEBUG,console -n TwitterAgent

6. Challenges & Solutions
Rate Limiting: Applied keyword filtering to capture relevant tweets only.
Schema Changes: Stored raw JSON in HDFS using DataStream for flexible analysis later.

7. Results
No data loss due to memory buffering.
Scalable architecture (multiple agents can be added).
Data stored by date for easy time-based analysis.

8. Conclusion
Apache Flume successfully streamed live Twitter data into HDFS in a reliable and scalable manner. This setup enabled real-time analytics and supported business intelligence reporting.
'''

def main():
    """Print the practical-8 handout text to stdout."""
    # print("")
    print(P8_CONTENT)

if __name__ == "__main__":
    main()
@@ -0,0 +1,134 @@
1
# Lab handout 9: expanded pseudo-distributed setup notes, including the
# full XML bodies for the four Hadoop config files (unlike handout 1,
# which only names them). Emitted verbatim by main() (the `p9` script).
P9_CONTENT = '''

1. Introduction

Pseudo-Distributed Mode is a Hadoop setup where all Hadoop daemons run on a single machine but in separate JVM processes. It simulates a real distributed cluster while using only one system.

It is mainly used for learning, testing, and development purposes.

2. Key Features

Each daemon (NameNode, DataNode, ResourceManager, NodeManager) runs separately.

Data is stored in HDFS but physically resides on the local disk.

Communication happens through localhost (127.0.0.1).

3. Hadoop Components Configured
HDFS (Storage Layer)

NameNode: Manages metadata and directory structure.

DataNode: Stores actual data blocks.

YARN (Resource Layer)

ResourceManager: Manages cluster resources.

NodeManager: Monitors resources and runs tasks.

4. Prerequisites

Ubuntu OS

Java 8 installed

SSH configured (passwordless login)

Hadoop extracted in /home/<user>/hadoop

5. Configuration Steps
1. Verify Java
java -version
2. Install Java (if needed)
sudo apt install openjdk-8-jdk -y
3. Set JAVA_HOME

Edit hadoop-env.sh and add:

export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
4. Configure core-site.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
5. Configure hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
6. Configure mapred-site.xml
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
7. Configure yarn-site.xml
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.address</name>
<value>localhost:8032</value>
</property>
</configuration>
6. Format HDFS
~/hadoop/bin/hdfs namenode -format
7. Start Hadoop Services

Start HDFS:

~/hadoop/sbin/start-dfs.sh

Start YARN:

~/hadoop/sbin/start-yarn.sh
8. Verify Services
jps

Expected services:

NameNode

DataNode

SecondaryNameNode

ResourceManager

NodeManager

Web Interfaces:

HDFS UI: http://localhost:9870

YARN UI: http://localhost:8088

9. Benefits

No extra hardware required

Ideal for learning and testing

Same configuration as production clusters

Fast debugging and experimentation

10. Conclusion

Hadoop Pseudo-Distributed Mode allows users to simulate a real Hadoop cluster on a single machine. It is widely used in educational labs and development environments to test Big Data applications without requiring large infrastructure.
'''

def main():
    """Print the practical-9 handout text to stdout."""
    # print("")
    print(P9_CONTENT)

if __name__ == "__main__":
    main()
@@ -0,0 +1,27 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "hadup"
7
+ version = "0.1.0"
8
+ description = "hadup Library made by Mannny"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = {text = "MIT"}
12
+ authors = [{name = "Manny", email = "Mannny@mannny.com"}]
13
+
14
+ [project.scripts]
15
+ p1 = "hadup.p1:main"
16
+ p2 = "hadup.p2:main"
17
+ p3 = "hadup.p3:main"
18
+ p4 = "hadup.p4:main"
19
+ p5 = "hadup.p5:main"
20
+ p6 = "hadup.p6:main"
21
+ p7 = "hadup.p7:main"
22
+ p8 = "hadup.p8:main"
23
+ p9 = "hadup.p9:main"
24
+ p10 = "hadup.p10:main"
25
+
26
+ [tool.hatch.build.targets.wheel]
27
+ packages = ["hadup"]