hadup 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hadup-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.4
2
+ Name: hadup
3
+ Version: 0.1.0
4
+ Summary: hadup Library made by Mannny
5
+ Author-email: Manny <Mannny@mannny.com>
6
+ License: MIT
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+
10
+ <!-- hadup Library made by Mannny -->
hadup-0.1.0/README.md ADDED
@@ -0,0 +1 @@
1
+ <!-- hadup Library made by Mannny -->
@@ -0,0 +1,2 @@
1
+ """hadup Library made by Mannny"""
2
+ __version__ = "0.1.0"
@@ -0,0 +1,85 @@
1
# Lab handout 1: single-node Hadoop cluster in pseudo-distributed mode.
# The text below is emitted verbatim by main() (exposed as the `p1`
# console script in pyproject.toml).
P1_CODE = '''
# 1 Create a Single Node Cluster in Pseudo-Distributed Mode
1. Introduction
Pseudo-Distributed Mode is a Hadoop setup where all Hadoop daemons run on a single machine but in separate JVM processes. It simulates a real distributed cluster while using only one system.
It is mainly used for learning, testing, and development purposes.

2. Key Features
Each daemon (NameNode, DataNode, ResourceManager, NodeManager) runs separately.
Data is stored in HDFS but physically resides on the local disk.
Communication happens through localhost (127.0.0.1).

3. Hadoop Components Configured
HDFS
NameNode: Manages metadata and directory structure.
DataNode: Stores actual data blocks.
YARN
ResourceManager: Manages cluster resources.
NodeManager: Monitors resources and runs tasks.

4. Prerequisites
Ubuntu OS
Java 8 installed
SSH configured (passwordless login)
Hadoop extracted in /home/<user>/hadoop

5. Configuration Steps
1. Verify Java
java -version
2. Install Java (if needed)
sudo apt install openjdk-8-jdk -y
3. Set JAVA_HOME

Edit hadoop-env.sh and add:
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64

4. Configure core-site.xml

5. Configure hdfs-site.xml

6. Configure mapred-site.xml

7. Configure yarn-site.xml

6. Format HDFS
~/hadoop/bin/hdfs namenode -format

7. Start Hadoop Services

Start HDFS:
~/hadoop/sbin/start-dfs.sh
Start YARN:
~/hadoop/sbin/start-yarn.sh

8. Verify Services
jps

Expected services:
NameNode
DataNode
SecondaryNameNode
ResourceManager
NodeManager

Web Interfaces:
HDFS UI: http://localhost:9870
YARN UI: http://localhost:8088

9. Benefits

No extra hardware required
Ideal for learning and testing
Same configuration as production clusters
Fast debugging and experimentation

10. Conclusion
Hadoop Pseudo-Distributed Mode allows users to simulate a real Hadoop cluster on a single machine. It is widely used in educational labs and development environments to test Big Data applications without requiring large infrastructure.

'''

def main():
    """Print the practical-1 handout text to stdout."""
    # print("")
    print(P1_CODE)

if __name__ == "__main__":
    main()
@@ -0,0 +1,10 @@
1
# Lab handout 10: placeholder module — the content block is still empty.
# Exposed as the `p10` console script in pyproject.toml.
P10_CONTENT = '''

'''


def main():
    """Emit the (currently empty) practical-10 content to stdout."""
    print(P10_CONTENT)


if __name__ == "__main__":
    main()
@@ -0,0 +1,79 @@
1
# Lab handout 2: introduction to Hadoop and basic HDFS shell commands.
# The text below is emitted verbatim by main() (the `p2` console script).
P2_CODE = '''
# 2 Introduction to Hadoop

Hadoop is a Big Data framework used to store and process large amounts of data.
It works on distributed systems using multiple computers.
Hadoop is scalable and fault tolerant.

Hadoop has two main components:
HDFS – used for storing large data.
MapReduce – used for processing data in parallel.

Starting Hadoop Services
Start the Hadoop daemons using:

hdfs --daemon start namenode
hdfs --daemon start datanode
yarn --daemon start resourcemanager
yarn --daemon start nodemanager

NameNode manages metadata.
DataNode stores data blocks.
ResourceManager manages cluster resources.
NodeManager runs tasks.

Verify Services
jps

NameNode, DataNode, ResourceManager, and NodeManager should appear.

Web Interface
HDFS UI: http://localhost:9870
YARN UI: http://localhost:8088


HDFS Commands

Check version:
hadoop version

List root directory:
hdfs dfs -ls /

Create directory:
hdfs dfs -mkdir /data

Create local file:
echo "Hello Hadoop" > local.txt

Upload file:
hdfs dfs -put local.txt /data/

List files:
hdfs dfs -ls /data

Download file:
hdfs dfs -get /data/local.txt

Show file content:
hdfs dfs -cat /data/local.txt

Copy file:
hdfs dfs -cp /data/local.txt /data/local_copy.txt

Rename file:
hdfs dfs -mv /data/local_copy.txt /data/local_renamed.txt

Delete file:
hdfs dfs -rm /data/local_renamed.txt

Delete folder:
hdfs dfs -rm -r /data
'''

def main():
    """Print the practical-2 handout text to stdout."""
    # print("")
    print(P2_CODE)

if __name__ == "__main__":
    main()
@@ -0,0 +1,34 @@
1
# Lab handout 3: WordCount example run with two reducers.
# The text below is emitted verbatim by main() (the `p3` console script).
P3_CODE = '''
# 3 WordCount 2 reducers

Create Input File
echo "hello hadoop hello mapreduce" > wc_input.txt

Create HDFS Directory
hdfs dfs -mkdir -p /wordcount_input

Upload File
hdfs dfs -put wc_input.txt /wordcount_input/
hdfs dfs -ls /

Run WordCount (2 Reducers)
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \
wordcount \
-D mapreduce.job.reduces=2 \
/wordcount_input \
/wordcount/output_2reducer

Check Output Files
hdfs dfs -ls /wordcount/output_2reducer

View Output
hdfs dfs -cat /wordcount/output_2reducer/part-r-*
'''

def main():
    """Print the practical-3 handout text to stdout."""
    # print("")
    print(P3_CODE)


if __name__ == "__main__":
    main()
@@ -0,0 +1,35 @@
1
# Lab handout 4: mapper-only WordCount (zero reducers; output is part-m-*).
# The text below is emitted verbatim by main() (the `p4` console script).
P4_CODE = '''
# 4 Mapper-Only WordCount

Create Input File
echo "hello hadoop hello mapreduce" > wc_input.txt
echo "mapper only job example in hadoop" >> wc_input.txt
cat wc_input.txt

Create HDFS Directory
hdfs dfs -mkdir -p /wordcount1/input

Upload File
hdfs dfs -put -f wc_input.txt /wordcount1/input/
hdfs dfs -ls /wordcount1/input

Run Mapper-Only Job
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \
wordcount \
-D mapreduce.job.reduces=0 \
/wordcount1/input \
/wordcount1/output_maponly

Check Output
hdfs dfs -ls /wordcount1/output_maponly

View Output
hdfs dfs -cat /wordcount1/output_maponly/part-m-*
'''

def main():
    """Print the practical-4 handout text to stdout."""
    # print("")
    print(P4_CODE)

if __name__ == "__main__":
    main()
@@ -0,0 +1,10 @@
1
# Lab handout 5: placeholder module — only the numbered heading so far.
# Exposed as the `p5` console script in pyproject.toml.
P5_CONTENT = '''
# 5
'''


def main():
    """Emit the practical-5 content to stdout."""
    print(P5_CONTENT)


if __name__ == "__main__":
    main()
@@ -0,0 +1,79 @@
1
# Lab handout 6: max temperature per year via Hadoop Streaming.
# Fixes relative to the original:
#  * dropped the run of invisible U+2800 (braille blank) characters that
#    trailed the opening triple quote — they printed as garbage;
#  * the literal is now a RAW string: previously the "\t" sequences inside
#    the embedded mapper/reducer snippets were interpreted as real TAB
#    characters, so the printed code no longer matched what students must
#    type. r''' preserves the backslash escapes verbatim;
#  * restored the indentation of the embedded Python snippets so they are
#    valid Python when copy-pasted (it had been flattened).
P6_CONTENT = r'''
# 6 Calculate Highest Temperature for Each Year using Hadoop MapReduce

Create Sample Temperature File
cat > temps.csv <<EOF
2000-01-01,25
2000-05-10,30
2000-12-31,28
2001-03-15,35
2001-07-20,40
2001-11-05,38
EOF

Upload File to HDFS
hdfs dfs -mkdir -p /maxtemp/input
hdfs dfs -put -f temps.csv /maxtemp/input/
hdfs dfs -ls /maxtemp/input

Create Mapper (max_temp_mapper.py)
cat > max_temp_mapper.py <<'EOF'
import sys
for line in sys.stdin:
    line=line.strip()
    if not line: continue
    parts=line.split(",")
    if len(parts)!=2: continue
    year=parts[0][:4]
    try:
        temp=float(parts[1])
    except:
        continue
    print(f"{year}\t{temp}")
EOF
chmod +x max_temp_mapper.py

Create Reducer (max_temp_reducer.py)
cat > max_temp_reducer.py <<'EOF'
import sys
current_year=None
current_max=None
for line in sys.stdin:
    line=line.strip()
    if not line: continue
    year,temp=line.split("\t")
    temp=float(temp)
    if current_year==year:
        if temp>current_max:
            current_max=temp
    else:
        if current_year:
            print(f"{current_year}\t{current_max}")
        current_year=year
        current_max=temp
if current_year:
    print(f"{current_year}\t{current_max}")
EOF
chmod +x max_temp_reducer.py

Run Hadoop Streaming Job
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
-input /maxtemp/input \
-output /maxtemp/output_max_temp \
-mapper "python3 max_temp_mapper.py" \
-reducer "python3 max_temp_reducer.py" \
-file max_temp_mapper.py \
-file max_temp_reducer.py

Check Output
hdfs dfs -ls /maxtemp/output_max_temp
hdfs dfs -cat /maxtemp/output_max_temp/part-*
'''

def main():
    """Print the practical-6 handout text to stdout."""
    print(P6_CONTENT)

if __name__ == "__main__":
    main()
@@ -0,0 +1,80 @@
1
# Lab handout 7: Apache Sqoop basic commands (RetailFlow case study).
# The text below is emitted verbatim by main() (the `p7` console script).
P7_CONTENT = '''
# 7 Study of Apache Sqoop Basic Commands
1. Executive Summary
Organization: RetailFlow Inc.
Objective: To migrate transactional data from MySQL to Hadoop HDFS for advanced analytics.
Tools Used: Apache Sqoop 1.4.x, Hadoop 3.x, MySQL 8.0

RetailFlow Inc. used Apache Sqoop to transfer data between MySQL and Hadoop in order to reduce database load and improve analytical performance.

2. Problem Statement
The company handled over 10 million transactions daily. Complex SQL queries caused latency in the MySQL production database.
The objectives were:
Move historical data to HDFS.
Maintain data consistency.
Automate data transfer between MySQL and Hadoop.

3. Implementation
Phase 1: Connectivity Check
List Databases
sqoop list-databases \
--connect jdbc:mysql://db-server.retailflow.com \
--username analytics_user \
--password-file /user/admin/.password

List Tables
sqoop list-tables \
--connect jdbc:mysql://db-server.retailflow.com/retail_db \
--username analytics_user \
--password-file /user/admin/.password

Phase 2: Full Data Import
sqoop import \
--connect jdbc:mysql://db-server.retailflow.com/retail_db \
--table transactions \
--target-dir /data/raw/transactions \
--m 4 \
--fields-terminated-by ','

Imported entire table into HDFS using 4 parallel mappers.

Phase 3: Selective Import
sqoop import \
--connect jdbc:mysql://db-server.retailflow.com/retail_db \
--table transactions \
--where "status='COMPLETED' AND trans_date > '2023-01-01'" \
--target-dir /data/analytics/filtered_transactions \
--m 1

Imported only required records using filter conditions.

Phase 4: Export Processed Data
sqoop export \
--connect jdbc:mysql://db-server.retailflow.com/retail_db \
--table loyalty_scores \
--export-dir /data/output/processed_loyalty \
--input-fields-terminated-by ',' \
--update-mode allowinsert

Exported processed data back to MySQL.

4. Challenges & Solutions
Used --password-file for security.
Adjusted --m to avoid database overload.
Used --as-parquetfile for better performance and storage efficiency.

5. Results
65% reduction in database load.
40% storage savings using Parquet.
Improved data accessibility for analytics.

6. Conclusion
Apache Sqoop successfully enabled secure and efficient data transfer between MySQL and Hadoop. It reduced system load, improved performance, and created a scalable data pipeline for analytics.
'''

def main():
    """Print the practical-7 handout text to stdout."""
    # print("")
    print(P7_CONTENT)

if __name__ == "__main__":
    main()
@@ -0,0 +1,52 @@
1
# Lab handout 8: Apache Flume configuration for Twitter data ingestion.
# The text below is emitted verbatim by main() (the `p8` console script).
P8_CONTENT = '''
# 8 Execute a Flume Configuration File to Extract Data from Twitter
1. Executive Summary

This case study demonstrates how Apache Flume was used to collect live Twitter data and store it in Hadoop HDFS for sentiment analysis and market trend monitoring. The system ensured reliable, real-time data ingestion.

2. Problem Statement
The client wanted to analyze brand perception in real time. Key challenges included:
Handling high tweet volume (velocity)
Managing unstructured JSON data
Preventing data loss during failures

3. Technology Stack
Data Source: Twitter Streaming API
Tool: Apache Flume 1.9.0
Storage: Hadoop HDFS (3.x)
Processing: JSON Interceptors

4. Flume Configuration Overview
The solution used a Flume agent with:
Source: TwitterSource (connects to Twitter API using keys and tokens)
Channel: Memory Channel (buffers tweets temporarily)
Sink: HDFS Sink (stores tweets in HDFS)
Tweets containing keywords like BigData, AI, Hadoop, MachineLearning were captured and stored in HDFS using date-based partitioning:
/user/flume/twitter_data/%Y/%m/%d/

5. Execution Steps
Ensure Hadoop HDFS is running.
Place required Twitter libraries in Flume’s lib folder.
Start the Flume agent using:
bin/flume-ng agent --conf ./conf/ -f conf/twitter_to_hdfs.conf \
-Dflume.root.logger=DEBUG,console -n TwitterAgent

6. Challenges & Solutions
Rate Limiting: Applied keyword filtering to capture relevant tweets only.
Schema Changes: Stored raw JSON in HDFS using DataStream for flexible analysis later.

7. Results
No data loss due to memory buffering.
Scalable architecture (multiple agents can be added).
Data stored by date for easy time-based analysis.

8. Conclusion
Apache Flume successfully streamed live Twitter data into HDFS in a reliable and scalable manner. This setup enabled real-time analytics and supported business intelligence reporting.
'''

def main():
    """Print the practical-8 handout text to stdout."""
    # print("")
    print(P8_CONTENT)

if __name__ == "__main__":
    main()
@@ -0,0 +1,134 @@
1
# Lab handout 9: expanded pseudo-distributed setup notes, including the
# full XML bodies for the four Hadoop config files (unlike handout 1,
# which only names them). Emitted verbatim by main() (the `p9` script).
P9_CONTENT = '''

1. Introduction

Pseudo-Distributed Mode is a Hadoop setup where all Hadoop daemons run on a single machine but in separate JVM processes. It simulates a real distributed cluster while using only one system.

It is mainly used for learning, testing, and development purposes.

2. Key Features

Each daemon (NameNode, DataNode, ResourceManager, NodeManager) runs separately.

Data is stored in HDFS but physically resides on the local disk.

Communication happens through localhost (127.0.0.1).

3. Hadoop Components Configured
HDFS (Storage Layer)

NameNode: Manages metadata and directory structure.

DataNode: Stores actual data blocks.

YARN (Resource Layer)

ResourceManager: Manages cluster resources.

NodeManager: Monitors resources and runs tasks.

4. Prerequisites

Ubuntu OS

Java 8 installed

SSH configured (passwordless login)

Hadoop extracted in /home/<user>/hadoop

5. Configuration Steps
1. Verify Java
java -version
2. Install Java (if needed)
sudo apt install openjdk-8-jdk -y
3. Set JAVA_HOME

Edit hadoop-env.sh and add:

export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
4. Configure core-site.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
5. Configure hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
6. Configure mapred-site.xml
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
7. Configure yarn-site.xml
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.address</name>
<value>localhost:8032</value>
</property>
</configuration>
6. Format HDFS
~/hadoop/bin/hdfs namenode -format
7. Start Hadoop Services

Start HDFS:

~/hadoop/sbin/start-dfs.sh

Start YARN:

~/hadoop/sbin/start-yarn.sh
8. Verify Services
jps

Expected services:

NameNode

DataNode

SecondaryNameNode

ResourceManager

NodeManager

Web Interfaces:

HDFS UI: http://localhost:9870

YARN UI: http://localhost:8088

9. Benefits

No extra hardware required

Ideal for learning and testing

Same configuration as production clusters

Fast debugging and experimentation

10. Conclusion

Hadoop Pseudo-Distributed Mode allows users to simulate a real Hadoop cluster on a single machine. It is widely used in educational labs and development environments to test Big Data applications without requiring large infrastructure.
'''

def main():
    """Print the practical-9 handout text to stdout."""
    # print("")
    print(P9_CONTENT)

if __name__ == "__main__":
    main()
@@ -0,0 +1,27 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "hadup"
7
+ version = "0.1.0"
8
+ description = "hadup Library made by Mannny"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = {text = "MIT"}
12
+ authors = [{name = "Manny", email = "Mannny@mannny.com"}]
13
+
14
+ [project.scripts]
15
+ p1 = "hadup.p1:main"
16
+ p2 = "hadup.p2:main"
17
+ p3 = "hadup.p3:main"
18
+ p4 = "hadup.p4:main"
19
+ p5 = "hadup.p5:main"
20
+ p6 = "hadup.p6:main"
21
+ p7 = "hadup.p7:main"
22
+ p8 = "hadup.p8:main"
23
+ p9 = "hadup.p9:main"
24
+ p10 = "hadup.p10:main"
25
+
26
+ [tool.hatch.build.targets.wheel]
27
+ packages = ["hadup"]