pyinspect-cli 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyinspect_cli-0.1.0/PKG-INFO +27 -0
- pyinspect_cli-0.1.0/README.md +16 -0
- pyinspect_cli-0.1.0/pyinspect_cli/__init__.py +0 -0
- pyinspect_cli-0.1.0/pyinspect_cli/cli.py +43 -0
- pyinspect_cli-0.1.0/pyinspect_cli/modules/__init__.py +0 -0
- pyinspect_cli-0.1.0/pyinspect_cli/modules/m01.py +41 -0
- pyinspect_cli-0.1.0/pyinspect_cli/modules/m02.py +111 -0
- pyinspect_cli-0.1.0/pyinspect_cli/modules/m03.py +19 -0
- pyinspect_cli-0.1.0/pyinspect_cli/modules/m04.py +24 -0
- pyinspect_cli-0.1.0/pyinspect_cli/modules/m05.py +82 -0
- pyinspect_cli-0.1.0/pyinspect_cli/modules/m06.py +40 -0
- pyinspect_cli-0.1.0/pyinspect_cli/modules/m07.py +117 -0
- pyinspect_cli-0.1.0/pyinspect_cli/modules/m08.py +44 -0
- pyinspect_cli-0.1.0/pyinspect_cli.egg-info/PKG-INFO +27 -0
- pyinspect_cli-0.1.0/pyinspect_cli.egg-info/SOURCES.txt +18 -0
- pyinspect_cli-0.1.0/pyinspect_cli.egg-info/dependency_links.txt +1 -0
- pyinspect_cli-0.1.0/pyinspect_cli.egg-info/entry_points.txt +2 -0
- pyinspect_cli-0.1.0/pyinspect_cli.egg-info/top_level.txt +1 -0
- pyinspect_cli-0.1.0/pyproject.toml +24 -0
- pyinspect_cli-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyinspect-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A CLI tool to inspect practical source codes
|
|
5
|
+
Author-email: Developer <developer@example.com>
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.7
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# pyinspect-cli
|
|
13
|
+
|
|
14
|
+
A CLI tool to inspect practical source codes.
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install -e .
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Usage
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pyinspect-cli list
|
|
26
|
+
pyinspect-cli 1
|
|
27
|
+
```
|
|
File without changes
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
def main(argv=None):
    """Entry point for the pyinspect-cli console script.

    Usage:
        pyinspect-cli list       - list the available modules
        pyinspect-cli <number>   - print the source of module <number>

    Args:
        argv: Optional argument list (defaults to ``sys.argv[1:]``);
            the parameter exists so the function can be driven from
            tests without patching ``sys.argv``.

    Exits with status 1 on missing/invalid arguments or a missing file.
    """
    args = sys.argv[1:] if argv is None else argv

    if not args:
        print("Usage: pyinspect-cli <number> | pyinspect-cli list")
        sys.exit(1)

    cmd = args[0]

    # Module sources ship inside the package, next to this file.
    package_dir = os.path.dirname(os.path.abspath(__file__))
    modules_dir = os.path.join(package_dir, 'modules')

    # The package bundles modules m01.py .. m08.py (single source of
    # truth for the count instead of repeating the literal 8).
    module_count = 8

    if cmd == 'list':
        for i in range(1, module_count + 1):
            print(f"{i} - Module {i}")
        return

    if cmd.isdigit():
        module_num = int(cmd)
        if 1 <= module_num <= module_count:
            file_name = f"m{module_num:02d}.py"
            file_path = os.path.join(modules_dir, file_name)

            if os.path.exists(file_path):
                # Read as UTF-8 explicitly so output does not depend on
                # the platform's default locale encoding.
                with open(file_path, 'r', encoding='utf-8') as f:
                    print(f.read())
            else:
                print(f"Error: Module file {file_name} not found.")
                sys.exit(1)
        else:
            print("Error: Please provide a module number between 1 and 8.")
            sys.exit(1)
    else:
        print(f"Error: Invalid command or module number '{cmd}'.")
        print("Usage: pyinspect-cli <number> | pyinspect-cli list")
        sys.exit(1)

if __name__ == "__main__":
    main()
|
|
File without changes
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Build a tiny inverted index over two documents, skipping NLTK English
stopwords, and print each term's postings with per-document counts."""
import nltk
from nltk.corpus import stopwords

document1 = "The quick brown fox jumped over a lazy dog"
document2 = "The lazy dog slept in the sun"

# The stopword corpus must be downloaded once before it can be loaded.
nltk.download('stopwords')
stopWords = stopwords.words('english')

tokens1 = document1.lower().split()
tokens2 = document2.lower().split()
terms = list(set(tokens1 + tokens2))

inverted_index = {}
occ_num_doc1 = {}
occ_num_doc2 = {}

for term in terms:
    # Stopwords carry no retrieval value; leave them out of the index.
    if term in stopWords:
        continue

    posting = []
    if term in tokens1:
        posting.append("Document 1")
        occ_num_doc1[term] = tokens1.count(term)
    if term in tokens2:
        posting.append("Document 2")
        occ_num_doc2[term] = tokens2.count(term)
    inverted_index[term] = posting

# Report each indexed term followed by "DocName(count)" entries.
for term, doc_list in inverted_index.items():
    print(term, "->", end="")
    for doc in doc_list:
        count = occ_num_doc1.get(term, 0) if doc == "Document 1" else occ_num_doc2.get(term, 0)
        print(f"{doc}({count})", end=", ")
    print()
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
#Part A
# Toy corpus: document id -> space-separated terms.
documents = {
    1: "apple banana orange",
    2: "apple banana",
    3: "banana orange",
    4: "apple"
}

def build_index(docs):
    """Build an inverted index mapping each term to the set of document
    ids whose text contains it.

    Args:
        docs: Mapping of document id -> document text.

    Returns:
        Dict of term -> set of document ids.
    """
    index = {}
    for doc_id, text in docs.items():
        # De-duplicate per document so each id is added once per term.
        for term in set(text.split()):
            index.setdefault(term, set()).add(doc_id)
    return index

inverted_index = build_index(documents)
|
|
21
|
+
|
|
22
|
+
def boolean_and(operands, index):
    """Return ids of documents containing ALL terms in *operands*.

    An empty operand list is the identity of intersection: every
    document known to the index matches.

    Args:
        operands: List of query terms.
        index: Inverted index mapping term -> set of document ids.

    Returns:
        List of matching document ids (sorted for the empty-query case,
        otherwise unordered).
    """
    if not operands:
        # Derive the document universe from the index itself instead of
        # reading the module-global `documents` dict (the original's
        # hidden coupling made the function unusable with other indexes).
        universe = set()
        for postings in index.values():
            universe |= postings
        return sorted(universe)

    result = set(index.get(operands[0], set()))
    for term in operands[1:]:
        result &= index.get(term, set())

    return list(result)
|
|
31
|
+
|
|
32
|
+
def boolean_or(operands, index):
    """Return ids of documents containing at least one term in *operands*.

    Args:
        operands: List of query terms.
        index: Inverted index mapping term -> set of document ids.

    Returns:
        List of matching document ids (unordered).
    """
    hits = set()
    for term in operands:
        # Unknown terms contribute nothing.
        hits |= index.get(term, set())
    return list(hits)
|
|
38
|
+
|
|
39
|
+
def boolean_not(operand, index, total_docs):
    """Return ids of documents in 1..total_docs NOT containing *operand*.

    Args:
        operand: Single query term.
        index: Inverted index mapping term -> set of document ids.
        total_docs: Number of documents; ids are assumed to be 1-based.

    Returns:
        List of non-matching document ids (unordered).
    """
    matching = set(index.get(operand, set()))
    return list(set(range(1, total_docs + 1)) - matching)
|
|
43
|
+
|
|
44
|
+
# Demonstration queries against the toy index.
query1 = ["apple", "banana"]
query2 = ["apple", "orange"]

result1 = boolean_and(query1, inverted_index)
result2 = boolean_or(query2, inverted_index)
result3 = boolean_not("orange", inverted_index, len(documents))

# Report each query result on its own line.
for label, result in (
    ("Documents containing 'apple' and 'banana':", result1),
    ("Documents containing 'apple' and 'orange':", result2),
    ("Documents not containing 'orange':", result3),
):
    print(label, result)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Part B:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import nltk
from nltk.corpus import stopwords
import numpy as np
from numpy.linalg import norm

# Training and testing datasets
train_set = ["The sky is blue.", "The sun is bright."]
test_set = ["The sun in the sky is bright."]

# Download stopwords and set them
nltk.download('stopwords')
stopWords = stopwords.words('english')

# Create CountVectorizer and TfidfTransformer
vectorizer = CountVectorizer(stop_words=stopWords)
transformer = TfidfTransformer()

# Learn the vocabulary from the training set and reuse it for the test set.
trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
testVectorizerArray = vectorizer.transform(test_set).toarray()

# Print vectorized results
print("Fit Vectorizer to train set:", trainVectorizerArray)
print("Transformer Vectorizer to test set:", testVectorizerArray)

# Cosine similarity between two count vectors, rounded to 3 decimals.
cx = lambda a, b: round(np.inner(a, b) / (norm(a) * norm(b)), 3)

# Compare every training vector against every test vector.
for vector in trainVectorizerArray:
    print(vector)

    for testV in testVectorizerArray:
        print(testV)
        cosine = cx(vector, testV)
        print(cosine)

# BUG FIX: the TF-IDF transformer must learn document frequencies from
# the TRAINING counts before transforming them; the original fitted on
# the test counts here, leaking test statistics into the train transform.
transformer.fit(trainVectorizerArray)
print()
print(transformer.transform(trainVectorizerArray).toarray())

# Re-fit on the test counts before transforming the test set (as before).
transformer.fit(testVectorizerArray)
print()

# Transform test set using TF-IDF
tfidf = transformer.transform(testVectorizerArray)
print(tfidf.todense())
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
def editDistance(str1, str2, m, n):
    """Return the Levenshtein distance between str1[:m] and str2[:n].

    The distance is the minimum number of single-character insertions,
    deletions, and substitutions needed to turn one prefix into the other.

    The original naive recursion was exponential in m + n; memoising on
    (m, n) makes it O(m * n) time while keeping the recursive interface.

    Args:
        str1, str2: The two strings to compare.
        m, n: Prefix lengths of str1 and str2 to consider.
    """
    memo = {}

    def _dist(i, j):
        # Base cases: transforming to/from an empty prefix costs the
        # length of the other prefix.
        if i == 0:
            return j
        if j == 0:
            return i
        key = (i, j)
        if key in memo:
            return memo[key]
        if str1[i - 1] == str2[j - 1]:
            result = _dist(i - 1, j - 1)
        else:
            result = 1 + min(
                _dist(i, j - 1),      # Insert
                _dist(i - 1, j),      # Remove
                _dist(i - 1, j - 1),  # Replace
            )
        memo[key] = result
        return result

    return _dist(m, n)

# Test case
str1 = "sunday"
str2 = "saturday"

print("Edit Distance is:", editDistance(str1, str2, len(str1), len(str2)))
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
def calculate_metrics(retrieved_set, relevant_set):
    """Compute precision, recall and F-measure for a retrieval result.

    Also prints the underlying true/false positive/negative counts.

    Args:
        retrieved_set: Set of document ids returned by the system.
        relevant_set: Set of document ids that are actually relevant.

    Returns:
        Tuple (precision, recall, f_measure); each component is 0 when
        its denominator would be 0.
    """
    tp = len(retrieved_set & relevant_set)
    fp = len(retrieved_set - relevant_set)
    fn = len(relevant_set - retrieved_set)

    print("True Positive:", tp,
          "\nFalse Positive:", fp,
          "\nFalse Negative:", fn, "\n")

    # Guard each ratio against an empty denominator.
    precision = tp / (tp + fp) if tp + fp else 0
    recall = tp / (tp + fn) if tp + fn else 0
    f_measure = 2 * precision * recall / (precision + recall) if precision + recall else 0

    return precision, recall, f_measure

# Example data
retrieved_set = set(["doc1", "doc2", "doc3"])
relevant_set = set(["doc1", "doc4"])

precision, recall, f_measure = calculate_metrics(retrieved_set, relevant_set)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-measure: {f_measure}")
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
class NaiveBayesClassifier:
    """Multinomial naive Bayes sentiment classifier over bag-of-words
    count vectors, with Laplace (add-one) smoothing at prediction time.

    Labels are the literal strings 'positive' and 'negative'.
    """

    def __init__(self):
        self.vocab = []               # ordered list of known words
        self.positive_counts = None   # per-word counts in positive docs
        self.negative_counts = None   # per-word counts in negative docs
        self.positive_prior = 0       # P(positive)
        self.negative_prior = 0       # P(negative)

    def preprocess_review(self, review):
        """Lowercase a review and split it into word tokens."""
        return re.findall(r'\b\w+\b', review.lower())

    def vectorize_review(self, review):
        """Map a token list onto a term-count vector over self.vocab.

        Tokens outside the vocabulary are ignored.
        """
        counts = np.zeros(len(self.vocab))
        for token in review:
            if token in self.vocab:
                counts[self.vocab.index(token)] += 1
        return counts

    def train(self, X_train, Y_train):
        """Fit the vocabulary, per-class word counts and class priors."""
        collected = []
        for review in X_train:
            collected.extend(self.preprocess_review(review))
        self.vocab = list(set(collected))  # Unique words

        self.positive_counts = np.zeros(len(self.vocab))
        self.negative_counts = np.zeros(len(self.vocab))

        vectors = [self.vectorize_review(self.preprocess_review(r)) for r in X_train]

        n_positive = 0
        for i, vec in enumerate(vectors):
            if Y_train[i] == 'positive':
                self.positive_counts += vec
                n_positive += 1
            else:
                self.negative_counts += vec

        self.positive_prior = n_positive / len(Y_train)
        self.negative_prior = 1 - self.positive_prior

    def predict(self, X_test):
        """Return the predicted label ('positive'/'negative') per review."""
        # Smoothed per-word likelihoods are the same for every review,
        # so compute them once up front.
        p_word_pos = (self.positive_counts + 1) / (sum(self.positive_counts) + len(self.vocab))
        p_word_neg = (self.negative_counts + 1) / (sum(self.negative_counts) + len(self.vocab))

        labels = []
        for review in X_test:
            vec = self.vectorize_review(self.preprocess_review(review))

            pos_score = np.prod(p_word_pos ** vec) * self.positive_prior
            neg_score = np.prod(p_word_neg ** vec) * self.negative_prior

            labels.append('positive' if pos_score > neg_score else 'negative')

        return labels
|
|
63
|
+
|
|
64
|
+
# Training Data
positive_review = ["The movie was amazing, I like great acting and an engaging plot"]
negative_review = ["I hate this movie so much, it's terrible!"]
X_train = positive_review + negative_review
Y_train = ['positive'] * len(positive_review) + ['negative'] * len(negative_review)
X_test = ["The acting was superb!"]
Y_test = ['positive']

# Train the classifier
classifier = NaiveBayesClassifier()
classifier.train(X_train, Y_train)

# Predictions
predictions = classifier.predict(X_test)
print("Predicted class for the new review:", predictions[0])

# Accuracy Calculation: fraction of test labels predicted correctly.
correct = sum(1 for truth, guess in zip(Y_test, predictions) if truth == guess)
accuracy = correct / len(Y_test)
print("Accuracy of the model:", accuracy)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Cluster a small document collection with TF-IDF + K-means, report the
silhouette score, and list the documents in each cluster."""
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Sample documents
documents = [
    "Machine learning is the study of computer algorithms that improve automatically through experience.",
    "Deep learning is a subset of machine learning.",
    "Natural language processing is a field of artificial intelligence.",
    "Computer vision is a field of study that enables computers to interpret and understand the visual world.",
    "Reinforcement learning is a type of machine learning algorithm that teaches an agent how to make decisions in an environment by rewarding desired behaviors.",
    "Information retrieval is the process of obtaining information from a collection of documents.",
    "Text mining is the process of deriving high-quality information from text.",
    "Data clustering is the task of dividing a set of objects into groups.",
    "Hierarchical clustering builds a tree of clusters.",
    "K-means clustering is a method of vector quantization."
]

# Convert documents into TF-IDF vectors
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

# Perform K-means clustering; fixed random_state keeps runs repeatable.
k = 3  # Number of clusters
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans.fit(X)

# Evaluate clustering results
silhouette_avg = silhouette_score(X, kmeans.labels_)
print("Silhouette Score:", silhouette_avg)

# Print each cluster's member documents.
for cluster_id in range(k):
    member_indices = np.where(kmeans.labels_ == cluster_id)[0]

    print(f"\nCluster {cluster_id + 1}:")
    for idx in member_indices:
        print("-", documents[idx])
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# Part A:
|
|
2
|
+
import requests
|
|
3
|
+
from bs4 import BeautifulSoup
|
|
4
|
+
|
|
5
|
+
class WebCrawler:
    """Minimal recursive crawler: fetches pages, indexes the title and
    first paragraph, and follows absolute links up to a depth limit."""

    def __init__(self):
        self.visited_urls = set()

    def crawl(self, url, depth=3):
        """Fetch *url*, index it, and recurse into its links.

        Recursion stops when depth reaches 0 or the URL was already
        visited; fetch/parse errors are reported and swallowed.
        """
        if depth == 0 or url in self.visited_urls:
            return

        try:
            response = requests.get(url, timeout=5)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                self.index_page(url, soup)
                self.visited_urls.add(url)

                for anchor in soup.find_all('a', href=True):
                    target = anchor.get('href')
                    if target.startswith('http'):  # Ensure it's a valid URL
                        print(f"Crawling: {target}")
                        self.crawl(target, depth - 1)

        except Exception as e:
            print(f"Error crawling {url}: {e}")

    def index_page(self, url, soup):
        """Extracts and indexes the page title and first paragraph."""
        title = soup.title.string if soup.title else "No title"
        paragraph = soup.find('p').get_text() if soup.find('p') else "No paragraph found"

        print(f"\nIndexing: {url}")
        print(f"Title: {title}")
        print(f"First Paragraph: {paragraph}\n")
|
|
37
|
+
if __name__ == "__main__":
    # Demo run: crawl the example domain two levels deep.
    start_url = "https://example.com"
    WebCrawler().crawl(start_url, depth=2)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# Part B:
|
|
47
|
+
import requests
|
|
48
|
+
from bs4 import BeautifulSoup
|
|
49
|
+
import time
|
|
50
|
+
from urllib.parse import urlparse
|
|
51
|
+
|
|
52
|
+
class WebCrawler:
    """Polite recursive crawler: honours robots.txt, rate-limits its
    requests, and indexes the title and first paragraph of each page."""

    def __init__(self):
        self.visited_urls = set()

    def crawl(self, url, depth=3, delay=1):
        """Fetch *url* (if robots.txt allows), index it, and recurse into
        its absolute links, sleeping *delay* seconds between requests.

        Recursion stops at depth 0 or on an already-visited URL.
        """
        if depth == 0 or url in self.visited_urls:
            return

        try:
            # Check robots.txt
            if not self.is_allowed_by_robots(url):
                print(f"Skipping {url} due to robots.txt rules")
                return

            response = requests.get(url, timeout=5)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                self.index_page(url, soup)
                self.visited_urls.add(url)

                for link in soup.find_all('a', href=True):
                    new_url = link.get('href')
                    if new_url.startswith('http'):
                        time.sleep(delay)  # Delay between requests
                        self.crawl(new_url, depth - 1, delay)
            else:
                print(f"Failed to fetch {url}: Status code {response.status_code}")

        except Exception as e:
            print(f"Error crawling {url}: {e}")

    def is_allowed_by_robots(self, url):
        """Return True if the site's robots.txt permits fetching *url*.

        BUG FIX: the original did a substring search for "Disallow: /"
        inside the "User-agent: *" section, so a narrower rule such as
        "Disallow: /private" wrongly blocked the entire site.  Parsing is
        now delegated to the stdlib robots.txt parser, which evaluates
        the actual path of *url* against the rules.
        """
        from urllib.robotparser import RobotFileParser

        parsed_url = urlparse(url)
        robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"

        try:
            response = requests.get(robots_url, timeout=5)
            if response.status_code == 200:
                parser = RobotFileParser()
                parser.parse(response.text.splitlines())
                return parser.can_fetch("*", url)
            # No readable robots.txt -> assume crawling is allowed.
            return True
        except requests.RequestException:
            return True  # If robots.txt is unreachable, assume allowed

    def index_page(self, url, soup):
        """Print an index entry: URL, title, and first paragraph."""
        title = soup.title.string if soup.title else "No title"
        paragraph = soup.find('p').get_text() if soup.find('p') else "No paragraph found"
        print(f"\nIndexing: {url}")
        print(f"Title: {title}")
        print(f"First Paragraph: {paragraph}")
        print("-------------------------------------------------")
|
|
113
|
+
|
|
114
|
+
# Example usage
if __name__ == "__main__":
    # Default depth/delay; crawl a real site as a demonstration.
    WebCrawler().crawl("https://www.mercedes-benz.com")
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# link analysis

def pagerank(graph, damping_factor=0.85, epsilon=1.0e-8, max_iterations=100):
    """Compute PageRank scores for a directed graph by power iteration.

    Args:
        graph: Mapping node -> list of nodes it links to.  Link targets
            that are not keys of the mapping are ignored (as before).
        damping_factor: Probability of following a link vs. teleporting.
        epsilon: Convergence threshold on the max per-node score change.
        max_iterations: Hard cap on the number of iterations.

    Returns:
        Dict mapping each node to its PageRank score.
    """
    num_nodes = len(graph)
    # Start from the uniform distribution.
    pagerank_scores = {node: 1.0 / num_nodes for node in graph}

    # PERF: precompute incoming links once.  The original rescanned every
    # edge list for every node in every iteration (O(V * E) per sweep).
    # De-duplicating targets preserves the original membership-test
    # semantics for repeated links, while out-degree keeps duplicates,
    # exactly matching the original len(links) divisor.
    incoming = {node: [] for node in graph}
    out_degree = {}
    for referring_page, links in graph.items():
        out_degree[referring_page] = len(links)
        for target in set(links):
            if target in incoming:
                incoming[target].append(referring_page)

    teleport = (1 - damping_factor) / num_nodes
    for _ in range(max_iterations):
        new_pagerank_scores = {}
        max_diff = 0

        for node in graph:
            new_pagerank = teleport
            for referring_page in incoming[node]:
                new_pagerank += damping_factor * (pagerank_scores[referring_page] / out_degree[referring_page])

            new_pagerank_scores[node] = new_pagerank
            max_diff = max(max_diff, abs(new_pagerank - pagerank_scores[node]))

        pagerank_scores = new_pagerank_scores

        if max_diff < epsilon:
            break

    return pagerank_scores
|
|
28
|
+
|
|
29
|
+
# Create a small web graph
web_graph = {
    'A': ['B', 'C'],
    'B': ['C'],
    'C': ['A']
}

# Compute PageRank scores
pagerank_scores = pagerank(web_graph)

# Analyze the results: report nodes from highest to lowest score.
print("PageRank scores:")
for node, score in sorted(pagerank_scores.items(), key=lambda item: item[1], reverse=True):
    print(f"{node}: {score:.6f}")
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyinspect-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A CLI tool to inspect practical source codes
|
|
5
|
+
Author-email: Developer <developer@example.com>
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.7
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# pyinspect-cli
|
|
13
|
+
|
|
14
|
+
A CLI tool to inspect practical source codes.
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install -e .
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Usage
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pyinspect-cli list
|
|
26
|
+
pyinspect-cli 1
|
|
27
|
+
```
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
pyinspect_cli/__init__.py
|
|
4
|
+
pyinspect_cli/cli.py
|
|
5
|
+
pyinspect_cli.egg-info/PKG-INFO
|
|
6
|
+
pyinspect_cli.egg-info/SOURCES.txt
|
|
7
|
+
pyinspect_cli.egg-info/dependency_links.txt
|
|
8
|
+
pyinspect_cli.egg-info/entry_points.txt
|
|
9
|
+
pyinspect_cli.egg-info/top_level.txt
|
|
10
|
+
pyinspect_cli/modules/__init__.py
|
|
11
|
+
pyinspect_cli/modules/m01.py
|
|
12
|
+
pyinspect_cli/modules/m02.py
|
|
13
|
+
pyinspect_cli/modules/m03.py
|
|
14
|
+
pyinspect_cli/modules/m04.py
|
|
15
|
+
pyinspect_cli/modules/m05.py
|
|
16
|
+
pyinspect_cli/modules/m06.py
|
|
17
|
+
pyinspect_cli/modules/m07.py
|
|
18
|
+
pyinspect_cli/modules/m08.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pyinspect_cli
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pyinspect-cli"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A CLI tool to inspect practical source codes"
|
|
9
|
+
authors = [
|
|
10
|
+
{ name = "Developer", email = "developer@example.com" }
|
|
11
|
+
]
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.7"
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.scripts]
|
|
21
|
+
pyinspect-cli = "pyinspect_cli.cli:main"
|
|
22
|
+
|
|
23
|
+
[tool.setuptools]
|
|
24
|
+
packages = ["pyinspect_cli", "pyinspect_cli.modules"]
|