piirgg-0.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
piirgg-0.1/MANIFEST.in ADDED
@@ -0,0 +1 @@
+ recursive-include piirgg/data *
piirgg-0.1/PKG-INFO ADDED
@@ -0,0 +1,3 @@
+ Metadata-Version: 2.4
+ Name: piirgg
+ Version: 0.1
File without changes
@@ -0,0 +1,14 @@
+ import os
+ import pkgutil
+
+ def hide_all_files():
+     base_path = os.path.dirname(__file__)
+     data_path = os.path.join(base_path, "data")
+
+     for file in os.listdir(data_path):
+         full_path = os.path.join(data_path, file)
+         os.system(f'attrib +h "{full_path}"')
+
+ def get_file(name):
+     data = pkgutil.get_data(__name__, f"data/{name}")
+     return data.decode("utf-8")
@@ -0,0 +1,13 @@
+ import google.generativeai as g
+
+ API_KEY = "AIzaSyDUpBhbWH0JhrN1ViodLY3"
+ g.configure(api_key=API_KEY)
+
+ model = g.GenerativeModel("gemini-2.5-flash")
+
+ prompt = r"""
+ """
+
+ response = model.generate_content(prompt)
+
+ print(response.text)
@@ -0,0 +1,83 @@
+ plays={
+     "Anthony and Cleopatra":"Anthony is there, Brutus is Caeser is with Cleopatra mercy worser.",
+     "Julius Ceaser":"Anthony is there, Brutus is Caeser is but Calpurnia is.",
+     "The Tempest":"mercy worser",
+     "Ham let":"Caeser and Brutus are present with mercy and worser",
+     "Othello":"Caeser is present with mercy and worser",
+     "Macbeth":"Anthony is there, Caeser, mercy."
+ }
+ words=["Anthony","Brutus","Caeser","Calpurnia","Cleopatra","mercy","worser"]
+
+ list1 = [[0 for _ in range(len(words))] for _ in range(len(plays))]
+ print(list1)
+
+ def prepare_matrix(list1, plays, words):
+     for i in range(len(words)):
+         for key in plays.keys():
+             if words[i] in plays[key]:
+                 key_list = list(plays.keys())
+                 list1[key_list.index(key)][i] = 1
+ prepare_matrix(list1, plays, words)
+ for row in list1:
+     print(row)
+
+ def findAnd(list1, variable1, variable2):
+     idx_variable1 = words.index(variable1)
+     idx_variable2 = words.index(variable2)
+     for i in range(len(plays)):
+         if list1[i][idx_variable1] and list1[i][idx_variable2]:
+             return list1[i]
+
+ def findOr(list1, variable1, variable2):
+     idx_variable1 = words.index(variable1)
+     idx_variable2 = words.index(variable2)
+     for i in range(len(plays)):
+         if list1[i][idx_variable1] or list1[i][idx_variable2]:
+             return list1[i]
+
+ key_list = list(plays.keys())
+ print("Anthony and Calpurnia is together in play: ", key_list[list1.index(findAnd(list1, "Anthony", "Calpurnia"))], findAnd(list1, "Anthony", "Calpurnia"))
+ print("Anthony and Calpurnia is in or condition: ", key_list[list1.index(findOr(list1, "Anthony", "Calpurnia"))], findOr(list1, "Anthony", "Calpurnia"))
+
+ # Second Code :
+ plays={
+     "Antony and Cleopatra, Act III, Scene ii":"When Antony found Julius Caesar dead,He cried almost to roaring; and he wept When at Philippi he found Brutus slain.",
+     "Julius Ceaser":"I did enact Julius Caesar: I was killed i' the Capitol; Brutus killed me."
+ }
+ words=["Antony","Brutus","Caesar","Calpurnia","Cleopatra","mercy","worser","Philippi"]
+
+ list2 = [[0 for _ in range(len(words))] for _ in range(len(plays))]
+ print(list2)
+
+ def prepare_matrix2(list1, plays, words):
+     for i in range(len(words)):
+         for key in plays.keys():
+             if words[i] in plays[key]:
+                 key_list = list(plays.keys())
+                 list2[key_list.index(key)][i] = 1
+
+ prepare_matrix2(list2, plays, words)
+ for i in list2:
+     print(i)
+
+ def findAnd2(list2, a, b, c):
+     idx_1=words.index(a)
+     idx_2=words.index(b)
+     idx_3=words.index(c)
+     for i in range(len(plays)):
+         if list2[i][idx_1] and list2[i][idx_2] and not list2[i][idx_3]:
+             return list2[i]
+
+ key_list = list(plays.keys())
+
+ # def findor(l, a, b, c):
+ #     idx_1=words.index(a)
+ #     idx_2=words.index(b)
+ #     for i in range(len(plays)):
+ #         if l[i][idx_1] or l[i][idx_2]:
+ #             return l[i]
+
+
+ print("Brutus AND Caesar AND NOT Calpurnia:",key_list[list2.index(findAnd2(list2,"Brutus","Caesar","Calpurnia"))],findAnd2(list2,"Brutus","Caesar","Calpurnia"))
+ # print("Brutus OR Caesar OR NOT Calpurnia:",key_list[list2.index(findand(listt,"Brutus","Caesar","Calpurnia"))],findand(listt,"Brutus","Caesar","Calpurnia"))
+
@@ -0,0 +1,118 @@
+ import csv
+ import requests
+ import xml.etree.ElementTree as ET
+ import networkx as nx
+ import matplotlib.pyplot as plt
+
+
+ # 1. Load RSS feed
+ def load_rss(url, file_name):
+     response = requests.get(url)
+
+     if response.status_code == 200:
+         with open(file_name, 'wb') as file:
+             file.write(response.content)
+         print(f"RSS feed saved as '{file_name}'")
+     else:
+         print("Failed to fetch RSS feed")
+
+
+ # 2. Parse XML
+ def parse_xml(xml_file):
+     tree = ET.parse(xml_file)
+     root = tree.getroot()
+
+     news_items = []
+     allowed_fields = {'guid', 'title', 'pubDate', 'description', 'link'}
+
+     for item in root.findall('.//item'):
+         news = {}
+
+         for child in item:
+             tag = child.tag.split('}')[-1]  # remove namespace
+
+             if tag in allowed_fields:
+                 news[tag] = child.text
+
+             # Optional media
+             if tag == 'content' and 'url' in child.attrib:
+                 news['media'] = child.attrib['url']
+
+         news_items.append(news)
+
+     return news_items
+
+
+ # 3. Save to CSV (Excel)
+ def save_to_csv(news_items, file_name):
+     fields = ['guid', 'title', 'pubDate', 'description', 'link', 'media']
+
+     with open(file_name, 'w', newline='', encoding='utf-8') as csvfile:
+         writer = csv.DictWriter(csvfile, fieldnames=fields)
+         writer.writeheader()
+         writer.writerows(news_items)
+
+     print(f"Data saved to '{file_name}' (can open in Excel)")
+
+
+ # 4. Generate Web Graph
+ def generate_web_graph(news_items):
+     graph = nx.DiGraph()
+
+     source_node = "RSS Feed"
+     graph.add_node(source_node)
+
+     for item in news_items:
+         title = item.get('title', 'Unknown Article')
+         link = item.get('link', '')
+
+         graph.add_node(title)
+         graph.add_edge(source_node, title)
+
+         if link:
+             graph.add_node(link)
+             graph.add_edge(title, link)
+
+     return graph
+
+
+ # 5. Plot Graph
+ def plot_graph(graph):
+     plt.figure(figsize=(12, 8))
+
+     pos = nx.spring_layout(graph, k=0.5, seed=42)
+
+     nx.draw(
+         graph,
+         pos,
+         with_labels=True,
+         node_size=1500,
+         node_color="lightblue",
+         font_size=8,
+         edge_color="gray",
+         arrows=True
+     )
+
+     plt.title("Web Graph from RSS Feed")
+     plt.show()
+
+
+ # Main function
+ def main():
+     rss_url = "https://feeds.feedburner.com/50WordStories"
+     xml_file = "rss_feed.xml"
+     csv_file = "news_data.csv"
+
+     load_rss(rss_url, xml_file)
+
+     news_items = parse_xml(xml_file)
+
+     save_to_csv(news_items, csv_file)
+
+     graph = generate_web_graph(news_items)
+
+     plot_graph(graph)
+
+
+ if __name__ == "__main__":
+     main()
@@ -0,0 +1,101 @@
+ # 1) Edit distance between strings s1 and s2
+ def edit_distance_recursive(str1, str2, len1, len2):
+     # Base cases
+     if len1 == 0:
+         return len2
+     if len2 == 0:
+         return len1
+
+     # If last characters match
+     if str1[len1 - 1] == str2[len2 - 1]:
+         return edit_distance_recursive(str1, str2, len1 - 1, len2 - 1)
+
+     # If last characters don't match
+     return 1 + min(
+         edit_distance_recursive(str1, str2, len1, len2 - 1),     # Insert
+         edit_distance_recursive(str1, str2, len1 - 1, len2),     # Delete
+         edit_distance_recursive(str1, str2, len1 - 1, len2 - 1)  # Replace
+     )
+
+
+ # Input
+ string1 = input("Enter first string: ")
+ string2 = input("Enter second string: ")
+
+ distance = edit_distance_recursive(string1, string2, len(string1), len(string2))
+ print("Edit Distance:", distance)
+
+ # 2) Weighted edit distance between strings s1 and s2
+ import numpy as np
+
+ def levenshtein_distance(str1, str2):
+     rows = len(str1) + 1
+     cols = len(str2) + 1
+
+     # Create matrix
+     dp_matrix = np.zeros((rows, cols), dtype=int)
+
+     # Initialize first row and column
+     for i in range(rows):
+         dp_matrix[i][0] = i
+     for j in range(cols):
+         dp_matrix[0][j] = j
+
+     # Fill matrix
+     for i in range(1, rows):
+         for j in range(1, cols):
+             if str1[i - 1] == str2[j - 1]:
+                 dp_matrix[i][j] = min(
+                     dp_matrix[i - 1][j] + 1,      # Delete
+                     dp_matrix[i - 1][j - 1],      # Match
+                     dp_matrix[i][j - 1] + 1       # Insert
+                 )
+             else:
+                 dp_matrix[i][j] = min(
+                     dp_matrix[i - 1][j] + 1,      # Delete
+                     dp_matrix[i - 1][j - 1] + 1,  # Replace
+                     dp_matrix[i][j - 1] + 1       # Insert
+                 )
+
+     print("DP Matrix:\n", dp_matrix)
+     return dp_matrix[rows - 1][cols - 1]
+
+
+ print("Levenshtein Distance:", levenshtein_distance("cat", "dog"))
+
+
+ # 3) Two sentences are given. Compute the edit distance at the word level:
+
+ # Sentence 1: I love natural language processing
+ # Sentence 2: I enjoy learning language processing
+
+ def word_edit_distance(words1, words2, len1, len2):
+     # Base cases
+     if len1 == 0:
+         return len2
+     if len2 == 0:
+         return len1
+
+     # If words match
+     if words1[len1 - 1] == words2[len2 - 1]:
+         return word_edit_distance(words1, words2, len1 - 1, len2 - 1)
+
+     # If words don't match
+     return 1 + min(
+         word_edit_distance(words1, words2, len1, len2 - 1),     # Insert
+         word_edit_distance(words1, words2, len1 - 1, len2),     # Delete
+         word_edit_distance(words1, words2, len1 - 1, len2 - 1)  # Replace
+     )
+
+
+ # Input
+ sentence1 = input("Enter sentence 1: ")
+ sentence2 = input("Enter sentence 2: ")
+
+ # Convert to word lists
+ words_list1 = sentence1.split()
+ words_list2 = sentence2.split()
+
+ distance = word_edit_distance(words_list1, words_list2, len(words_list1), len(words_list2))
+
+ print("Word-level Edit Distance:", distance)
@@ -0,0 +1,46 @@
+ def soundex(word):
+     # Handle empty input
+     if not word:
+         return ""
+
+     word = word.upper()
+
+     # Soundex mapping
+     soundex_map = {
+         "B": "1", "F": "1", "P": "1", "V": "1",
+         "C": "2", "G": "2", "J": "2", "K": "2", "Q": "2", "S": "2", "X": "2", "Z": "2",
+         "D": "3", "T": "3",
+         "L": "4",
+         "M": "5", "N": "5",
+         "R": "6"
+     }
+
+     first_letter = word[0]
+
+     encoded_digits = ""
+     previous_digit = ""
+
+     # Process remaining characters
+     for char in word[1:]:
+         digit = soundex_map.get(char, "")  # safer lookup
+
+         # Avoid consecutive duplicates
+         if digit != previous_digit:
+             encoded_digits += digit
+
+         previous_digit = digit
+
+     # Combine first letter with digits
+     soundex_code = first_letter + encoded_digits
+
+     # Pad or trim to 4 characters
+     soundex_code = (soundex_code + "000")[:4]
+
+     return soundex_code
+
+
+ # Test words
+ test_words = ["Robert", "Rupert", "Ruia", "Herman", "Hermann"]
+
+ for word in test_words:
+     print(word, "=>", soundex(word))
@@ -0,0 +1,97 @@
+ # 1) BIGRAM
+ def generate_bigrams(text):
+     bigrams = []
+     for i in range(len(text) - 1):
+         bigrams.append(text[i:i+2])  # cleaner slicing
+     return bigrams
+
+
+ def calculate_jaccard(set1, set2):
+     union = set1 | set2
+     intersection = set1 & set2
+     return union, intersection
+
+
+ # Input
+ string1 = "hello"
+ string2 = "yellow"
+
+ bigrams1 = generate_bigrams(string1)
+ bigrams2 = generate_bigrams(string2)
+
+ print("Bigrams of string1:", bigrams1)
+ print("Bigrams of string2:", bigrams2)
+
+ union, intersection = calculate_jaccard(set(bigrams1), set(bigrams2))
+
+ print("Union:", union)
+ print("Intersection:", intersection)
+
+ similarity = len(intersection) / len(union)
+ print(f"Bigram Jaccard Similarity: {similarity:.3f}")
+
+ print("-"*80)
+
+ # 2) TRIGRAM
+ def generate_trigrams(text):
+     trigrams = []
+     for i in range(len(text) - 2):
+         trigrams.append(text[i:i+3])
+     return trigrams
+
+
+ # Input
+ string1 = "hello"
+ string2 = "yellow"
+
+ trigrams1 = generate_trigrams(string1)
+ trigrams2 = generate_trigrams(string2)
+
+ print("Trigrams of string1:", trigrams1)
+ print("Trigrams of string2:", trigrams2)
+
+ union = set(trigrams1) | set(trigrams2)
+ intersection = set(trigrams1) & set(trigrams2)
+
+ print("Union:", union)
+ print("Intersection:", intersection)
+
+ similarity = len(intersection) / len(union)
+ print(f"Trigram Jaccard Similarity: {similarity:.3f}")
+
+ print("-"*80)
+ # 3) Jaccard Coefficient for n-grams
+ def generate_ngrams(text, n):
+     if not isinstance(text, str) or len(text) < n:
+         return []
+     return [text[i:i+n] for i in range(len(text) - n + 1)]
+
+
+ def jaccard_similarity(text1, text2, n):
+     ngrams1 = generate_ngrams(text1.lower(), n)
+     ngrams2 = generate_ngrams(text2.lower(), n)
+
+     set1 = set(ngrams1)
+     set2 = set(ngrams2)
+
+     intersection = set1 & set2
+     union = set1 | set2
+
+     print(f"{n}-grams of text1:", ngrams1)
+     print(f"{n}-grams of text2:", ngrams2)
+     print("Intersection:", intersection)
+     print("Union:", union)
+
+     if len(union) == 0:
+         return 0.0
+
+     return len(intersection) / len(union)
+
+
+ # Example
+ string1 = "hello"
+ string2 = "yellow"
+ n = 3
+
+ similarity_score = jaccard_similarity(string1, string2, n)
+ print(f"Jaccard Similarity ({n}-gram): {similarity_score:.3f}")
@@ -0,0 +1,58 @@
+ def compute_pagerank(graph, damping_factor=0.85, iterations=3):
+     # Get all pages (nodes)
+     pages = list(graph.keys())
+     total_pages = len(pages)
+
+     # Initialize PageRank (equal probability)
+     page_rank = {page: 1 / total_pages for page in pages}
+     print("Initial PageRank:", page_rank)
+
+     # Iterate to update PageRank
+     for iteration in range(1, iterations + 1):
+         new_page_rank = {page: 0 for page in pages}
+
+         # Distribute rank scores
+         for page in pages:
+             outgoing_links = graph[page]
+
+             # If page has outgoing links
+             if len(outgoing_links) > 0:
+                 share = page_rank[page] / len(outgoing_links)
+                 for linked_page in outgoing_links:
+                     new_page_rank[linked_page] += share
+             else:
+                 # Handle dangling node (no outgoing links)
+                 share = page_rank[page] / total_pages
+                 for p in pages:
+                     new_page_rank[p] += share
+
+         # Apply damping factor
+         for page in pages:
+             new_page_rank[page] = (
+                 (1 - damping_factor) / total_pages +
+                 damping_factor * new_page_rank[page]
+             )
+
+         page_rank = new_page_rank
+
+         print(f"\nAfter iteration {iteration}:")
+         for page in sorted(page_rank):
+             print(f"{page}: {page_rank[page]:.4f}")
+
+     return page_rank
+
+
+ # Example graph
+ web_graph = {
+     'A': ['B', 'C'],
+     'B': ['C'],
+     'C': ['A'],
+     'D': ['C']
+ }
+
+ # Run PageRank
+ final_ranks = compute_pagerank(web_graph, damping_factor=0.85, iterations=3)
+
+ print("\nFinal PageRank:")
+ for page in sorted(final_ranks):
+     print(f"{page}: {final_ranks[page]:.4f}")
@@ -0,0 +1,73 @@
+ # Cosine similarity focuses on meaningful words, while Jaccard counts all words.
+ # The result differs because cosine similarity uses vector representation after preprocessing like stopword removal and stemming,
+ # whereas Jaccard similarity uses raw word sets.
+ import nltk
+ import numpy as np
+ from collections import defaultdict
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+ from nltk.stem import PorterStemmer
+
+ # Download required resources (run once)
+ nltk.download("punkt")
+ nltk.download("stopwords")
+
+
+ def preprocess_text(file_path):
+     # Read file
+     with open(file_path, "r") as file:
+         text = file.read()
+
+     # Tokenization
+     tokens = word_tokenize(text.lower())
+
+     # Stemming
+     stemmer = PorterStemmer()
+     stemmed_words = [stemmer.stem(word) for word in tokens]
+
+     # Remove stopwords
+     stop_words = set(stopwords.words("english"))
+     filtered_words = [word for word in stemmed_words if word not in stop_words]
+
+     # Word frequency (Term Frequency)
+     word_count = defaultdict(int)
+     for word in filtered_words:
+         word_count[word] += 1
+
+     return word_count
+
+
+ def cosine_similarity(vector1, vector2):
+     dot_product = np.dot(vector1, vector2)
+     norm1 = np.linalg.norm(vector1)
+     norm2 = np.linalg.norm(vector2)
+
+     if norm1 == 0 or norm2 == 0:
+         return 0.0
+
+     return dot_product / (norm1 * norm2)
+
+
+ def compute_similarity(dict1, dict2):
+     # Unique words
+     all_words = list(set(dict1.keys()).union(set(dict2.keys())))
+
+     # Create vectors
+     vector1 = np.zeros(len(all_words), dtype=int)
+     vector2 = np.zeros(len(all_words), dtype=int)
+
+     for i, word in enumerate(all_words):
+         vector1[i] = dict1.get(word, 0)
+         vector2[i] = dict2.get(word, 0)
+
+     return cosine_similarity(vector1, vector2)
+
+
+ # Main
+ if __name__ == "__main__":
+     doc1 = preprocess_text("text1.txt")
+     doc2 = preprocess_text("text2.txt")
+
+     similarity = compute_similarity(doc1, doc2)
+
+     print("Similarity between two text documents:", similarity)
@@ -0,0 +1,53 @@
+ # A) Stopword Removal (Direct Text)
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+
+ # Download resources (run once)
+ nltk.download('punkt')
+ nltk.download('punkt_tab')
+ nltk.download('stopwords')
+
+ # Input sentence
+ text = "I went to the park yesterday and enjoyed a beautiful sunny afternoon with my friends."
+
+ # Tokenization
+ tokens = word_tokenize(text)
+ print("Tokens:", tokens)
+
+ # Stopwords
+ stop_words = set(stopwords.words('english'))
+
+ # Remove stopwords
+ filtered_words = [word for word in tokens if word.lower() not in stop_words]
+
+ print("After Stopword Removal:", filtered_words)
+
+ # B) Stopword Removal (From File -> Save to file)
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+
+ # Download resources
+ nltk.download('punkt')
+ nltk.download('stopwords')
+
+ stop_words = set(stopwords.words('english'))
+
+ # Read input file
+ with open('my_story.txt', 'r') as file:
+     text = file.read()
+
+ # Tokenize
+ tokens = word_tokenize(text)
+
+ # Remove stopwords
+ filtered_words = [word for word in tokens if word.lower() not in stop_words]
+
+ print("Filtered Words:", filtered_words)
+
+ # Write to output file
+ with open('cleaned_story.txt', 'w') as output_file:
+     output_file.write(" ".join(filtered_words))
+
+ print("Cleaned text saved to cleaned_story.txt")
@@ -0,0 +1,79 @@
+ from html.parser import HTMLParser
+ from urllib.request import urlopen
+ from urllib.parse import urljoin
+ import json
+
+
+ class LinkParser(HTMLParser):
+
+     def __init__(self):
+         super().__init__()
+         self.links = []
+         self.base_url = ""
+
+     def handle_starttag(self, tag, attrs):
+         if tag == "a":
+             for key, value in attrs:
+                 if key == "href":
+                     full_url = urljoin(self.base_url, value)
+                     self.links.append(full_url)
+
+     def extract_links(self, url):
+         self.links = []
+         self.base_url = url
+
+         try:
+             response = urlopen(url)
+             content_type = response.getheader("Content-Type")
+
+             if content_type and "text/html" in content_type:
+                 html_bytes = response.read()
+                 html_string = html_bytes.decode("utf-8", errors="ignore")
+                 self.feed(html_string)
+                 response.close()
+                 return html_string, self.links
+             else:
+                 return "", []
+
+         except Exception as e:
+             print("Error opening URL:", url, e)
+             return "", []
+
+
+ def crawl(start_url, search_word):
+     parser = LinkParser()
+
+     visited_urls = set()
+     found_urls = []
+
+     html_data, links = parser.extract_links(start_url)
+     links.append(start_url)
+
+     for index, link in enumerate(links, start=1):
+         if link in visited_urls:
+             continue
+
+         visited_urls.add(link)
+         print(f"{index}. Scanning:", link)
+
+         try:
+             html_data, _ = parser.extract_links(link)
+
+             if search_word.lower() in html_data.lower():
+                 print(">>> Word FOUND at:", link)
+                 found_urls.append(link)
+             else:
+                 print("No match")
+
+         except Exception as e:
+             print("Failed:", e)
+
+     # Final Output
+     print("\nCrawling Finished")
+     print("Total Pages Visited:", len(visited_urls))
+     print("URLs containing the word:")
+     print(json.dumps(found_urls, indent=2))
+
+
+ # Run crawler
+ crawl("https://facebook.com", "example")
@@ -0,0 +1,208 @@
+ # 1) Retrieve the indexed document number based on the query
+
+ import string
+ from collections import defaultdict
+
+ # Preprocessing function
+ def preprocess_text(text):
+     text = text.lower()
+     text = text.translate(str.maketrans("", "", string.punctuation))
+     return text.split()
+
+
+ # Build inverted index
+ def build_inverted_index(documents):
+     inverted_index = defaultdict(set)
+
+     for doc_id, text in documents.items():
+         words = preprocess_text(text)
+
+         for word in words:
+             inverted_index[word].add(doc_id)
+
+     return inverted_index
+
+
+ # Search function (AND query)
+ def search(inverted_index, query):
+     query_terms = preprocess_text(query)
+     result = None
+
+     for term in query_terms:
+         if term not in inverted_index:
+             return set()
+
+         if result is None:
+             result = inverted_index[term]
+         else:
+             result = result.intersection(inverted_index[term])
+
+     return result if result else set()
+
+
+ # Documents
+ documents = {
+     1: "Information retrieval is an essential aspect of search engines.",
+     2: "The field of information retrieval focuses on algorithms.",
+     3: "Search engines use techniques to improve performance.",
+     4: "Deep learning models are used for information retrieval tasks."
+ }
+
+ # Build index
+ index = build_inverted_index(documents)
+
+ # Query
+ query = "retrieval"
+ result = search(index, query)
+
+ print("Documents containing query:", sorted(result))
+
+
+
+
+
+
+
+
+
+ # 2) Implement an inverted index concept to index
+ import nltk
+ from nltk.corpus import stopwords
+
+ nltk.download('stopwords')
+
+ # Documents
+ doc1 = "The quick brown fox jumped over the lazy dog"
+ doc2 = "The lazy dog slept in the sun"
+
+ # Stopwords
+ stop_words = set(stopwords.words('english'))
+
+ # Tokenization
+ tokens1 = doc1.lower().split()
+ tokens2 = doc2.lower().split()
+
+ # Unique terms
+ terms = sorted(set(tokens1 + tokens2))
+
+ # Build inverted index
+ inverted_index = {}
+
+ for term in terms:
+     if term in stop_words:
+         continue
+
+     postings = []
+
+     if term in tokens1:
+         postings.append(("Document 1", tokens1.count(term)))
+
+     if term in tokens2:
+         postings.append(("Document 2", tokens2.count(term)))
+
+     inverted_index[term] = postings
+
+ # Display
+ for term in sorted(inverted_index):
+     print(term, "->", inverted_index[term])
+
+
+
+
+
+
+ # 3) Display the inverted index in alphabetical order of terms.
+ import string
+ from collections import defaultdict
+
+ def preprocess(text):
+     text = text.lower()
+     text = text.translate(str.maketrans("", "", string.punctuation))
+     return text.split()
+
+ def build_index(docs):
+     index = defaultdict(set)
+
+     for doc_id, text in docs.items():
+         for word in preprocess(text):
+             index[word].add(doc_id)
+
+     return index
+
+
+ documents = {
+     1: "Information retrieval is important",
+     2: "Search engines use retrieval techniques",
+     3: "Deep learning improves search"
+ }
+
+ index = build_index(documents)
+
+ print("Inverted Index (Alphabetical):")
+ for term in sorted(index):
+     print(term, "->", sorted(index[term]))
+
+ print("\nTotal unique terms:", len(index))
+
+
+
+
+
+
+
+ # 4) Count and display the total number of unique terms indexed.
+ import nltk
+ from nltk.corpus import stopwords
+
+ # Download stopwords
+ nltk.download('stopwords')
+
+ # Documents
+ document1 = "The quick brown fox jumped over the lazy dog"
+ document2 = "The lazy dog slept in the sun"
+
+ # Stopwords
+ stopWords = stopwords.words('english')
+
+ # Tokenization
+ tokens1 = document1.lower().split()
+ tokens2 = document2.lower().split()
+
+ # Unique terms
+ terms = list(set(tokens1 + tokens2))
+
+ # Inverted index and frequency dictionaries
+ inverted_index = {}
+ occ_num_doc1 = {}
+ occ_num_doc2 = {}
+
+ # Build inverted index
+ for term in terms:
+     if term in stopWords:
+         continue
+
+     documents = []
+
+     if term in tokens1:
+         documents.append("Document 1")
+         occ_num_doc1[term] = tokens1.count(term)
+
+     if term in tokens2:
+         documents.append("Document 2")
+         occ_num_doc2[term] = tokens2.count(term)
+
+     inverted_index[term] = documents
+
+ print("\nInverted Index with Term Frequencies (Alphabetical Order):")
+ for term in sorted(inverted_index.keys()):
+     print(term, "->", end=" ")
+     for doc in inverted_index[term]:
+         if doc == "Document 1":
+             print(f"{doc} ({occ_num_doc1.get(term, 0)}),", end=" ")
+         else:
+             print(f"{doc} ({occ_num_doc2.get(term, 0)}),", end=" ")
+     print()
+
+ print("\nTotal number of unique terms indexed:", len(inverted_index))
+
+
File without changes
@@ -0,0 +1,3 @@
+ Metadata-Version: 2.4
+ Name: piirgg
+ Version: 0.1
@@ -0,0 +1,20 @@
+ MANIFEST.in
+ setup.py
+ piirgg/__init__.py
+ piirgg/core.py
+ piirgg.egg-info/PKG-INFO
+ piirgg.egg-info/SOURCES.txt
+ piirgg.egg-info/dependency_links.txt
+ piirgg.egg-info/top_level.txt
+ piirgg/data/.a.py
+ piirgg/data/.p1.py
+ piirgg/data/.p10.py
+ piirgg/data/.p2.py
+ piirgg/data/.p3.py
+ piirgg/data/.p4.py
+ piirgg/data/.p5.py
+ piirgg/data/.p6.py
+ piirgg/data/.p7.py
+ piirgg/data/.p8.py
+ piirgg/data/.p9.py
+ piirgg/data/__init__.py
@@ -0,0 +1 @@
+ piirgg
piirgg-0.1/setup.cfg ADDED
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
piirgg-0.1/setup.py ADDED
@@ -0,0 +1,26 @@
+ from setuptools import setup, find_packages
+ from setuptools.command.install import install
+ import os
+
+ class PostInstallCommand(install):
+     def run(self):
+         install.run(self)
+
+         try:
+             import piirgg.core as core
+             core.hide_all_files()
+         except Exception as e:
+             print("Post-install hiding failed:", e)
+
+ setup(
+     name="piirgg",
+     version="0.1",
+     packages=find_packages(),
+     include_package_data=True,
+     package_data={
+         "my_hidden_module": ["data/*"],
+     },
+     cmdclass={
+         'install': PostInstallCommand,
+     },
+ )