tilde-scraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 233e57dca85aa06d060975d41dc93b3c42b4f8b5b780ea98bd3bc55a240fa821
4
+ data.tar.gz: b7bdc67f83cf7a408ab46c18c37ac6043ed9d9efe27c46cce90b8643b7389509
5
+ SHA512:
6
+ metadata.gz: 4b141b97282e2f2559fd7374c0a6267c11bdb0590635addc743868b17eb44de27908ce15d325ab774d3f133c9cbc1a3686944bd2df1962c2cf65520ade595102
7
+ data.tar.gz: f55175e57c0273c2ce319c9d4dc8bc347c57e5499799b63f9a378d5d1e89001f80a1d4369f89733b96b26341a58769113c41e8f0c0ef2a0517426295c842995f
data/bin/tilde-scraper ADDED
@@ -0,0 +1,4 @@
1
#!/usr/bin/env ruby
# Executable entry point: loads the library, then starts the interactive
# command line interface.
require_relative '../lib/tilde_scraper.rb'

cli = TildeScraper::CommandLineInterface.new
cli.run
@@ -0,0 +1,16 @@
1
+ require 'open-uri'
2
+ require 'pry'
3
+ require 'nokogiri'
4
+
5
+
6
+ require_relative '../lib/tilde_scraper/concerns/memorable.rb'
7
+ require_relative '../lib/tilde_scraper/topic/topic.rb'
8
+ require_relative '../lib/tilde_scraper/topic/link_topic.rb'
9
+ require_relative '../lib/tilde_scraper/topic/text_topic.rb'
10
+ require_relative '../lib/tilde_scraper/topic/topic'
11
+ require_relative '../lib/tilde_scraper/api.rb'
12
+ require_relative '../lib/tilde_scraper/cli.rb'
13
+ require_relative '../lib/tilde_scraper/comment.rb'
14
+ require_relative '../lib/tilde_scraper/group.rb'
15
+ require_relative '../lib/tilde_scraper/page.rb'
16
+ require_relative '../lib/tilde_scraper/scraper.rb'
@@ -0,0 +1,4 @@
1
# Root namespace of the gem.
#
# It is declared here, before the environment file is loaded, because the
# files required from there open their classes and modules in the compact
# TildeScraper::Name form, which needs the namespace to exist already.
# The page id counter is initialised by the API module itself, so it is not
# duplicated here.
module TildeScraper
end

require_relative '../config/enviornment.rb'
@@ -0,0 +1,41 @@
1
# High level entry points of the gem: each method asks TildeScraper::Scraper
# for raw hashes and turns them into Page, Topic, Group or Comment objects.
module TildeScraper
  # Id given to the next scraped page. Stored as a module-level instance
  # variable rather than a @@class variable, so the counter belongs to this
  # module only.
  @page_id = 0

  # Scrapes +url+ and registers the page together with its topics.
  #
  # The scraper returns a two element array: the page attribute hash and an
  # array of topic attribute hashes. Every topic hash is tagged with the id
  # of the page (and with the page's group, when the page has one) before
  # the topic objects are created, so Page#topics can find them again.
  #
  # @param url [String] address of the page to scrape
  # @return [TildeScraper::Page] the newly created page
  def self.get_page(url)
    page_data, topic_data = TildeScraper::Scraper.scrape_page(url)

    current_id = @page_id
    page_data[:page_id] = current_id
    page = TildeScraper::Page.create(page_data)

    topic_data.each do |topic_hash|
      topic_hash[:group] = page.group if page.group
      topic_hash[:page_id] = current_id
    end
    # The counter only advances once the page object exists.
    @page_id += 1

    TildeScraper::Topic.create_from_array(topic_data)
    page
  end

  # Scrapes +url+ like get_page and additionally loads the comments of every
  # topic found on that page.
  #
  # @param url [String] address of the page to scrape
  # @return [Array] the topics of the page (the result of iterating them)
  def self.get_page_with_comments(url)
    page = get_page(url)
    page.topics.each do |topic|
      get_comments(topic.comment_link)
    end
  end

  # Discards the known groups and rebuilds the list from the "/groups" page.
  #
  # @return [Array<TildeScraper::Group>] the freshly created groups
  def self.get_groups
    TildeScraper::Group.all.clear
    group_data = TildeScraper::Scraper.scrape_groups("/groups")
    TildeScraper::Group.create_from_array(group_data)
  end

  # Scrapes the comment tree found at +url+ and creates Comment objects for it.
  #
  # @param url [String] comment link of a topic, as produced by the scraper
  # @return [Array<TildeScraper::Comment>] the top level comments
  def self.get_comments(url)
    comment_data = TildeScraper::Scraper.scrape_comments(url)
    TildeScraper::Comment.create_from_array(comment_data)
  end
end
@@ -0,0 +1,122 @@
1
module TildeScraper
  # Interactive text interface of the gem.
  #
  # It keeps track of the page currently being browsed (@page) and maps the
  # commands typed by the user to the TildeScraper API.
  class CommandLineInterface
    # Loads the front page, prints the help text and then processes commands
    # until the user types "exit" or the input stream ends.
    #
    # @return [nil]
    def run
      front_page
      help
      loop do
        print "Please Enter A Command: "
        # Read from $stdin explicitly: Kernel#gets would read from files named
        # in ARGV. A nil line means end of input and is treated like "exit".
        line = $stdin.gets
        input = line ? line.strip.downcase : "exit"
        command, argument = input.split(" ")

        case command
        when "help"
          help
        when "exit"
          puts "Goodbye"
          break
        when "frontpage"
          front_page
          page_list
        when "groups"
          groups
        when "group"
          group(argument)
        when "page", "list"
          # "page" used to call an undefined method; it now lists the current
          # page exactly like "list".
          page_list
        when "next"
          next_page
        when "prev"
          prev_page
        when "view"
          view(argument)
        when "comments"
          comments(argument)
        else
          puts "Invalid command"
        end
      end
    end

    # Prints the content of the topic with the given 1-based index.
    #
    # @param index_string [String, nil] index as typed by the user
    def view(index_string)
      index = validate_index(index_string, @page.topics.length)
      return nil unless index

      @page.topics[index].display_content
    end

    # Moves to the page behind the current page's next link, if there is one.
    def next_page
      if @page.next_link
        @page = TildeScraper.get_page(@page.next_link)
        page_list
      else
        puts "Last page"
      end
    end

    # Moves to the page behind the current page's previous link, if there is
    # one. The guard checks prev_link (not next_link), so a page without a
    # previous link no longer tries to fetch a nil url.
    def prev_page
      if @page.prev_link
        @page = TildeScraper.get_page(@page.prev_link)
        page_list
      else
        puts "No previous page"
      end
    end

    # Prints all groups, scraping them first when none are known yet.
    def groups
      load_groups
      TildeScraper::Group.display
    end

    # Opens the page of the group with the given 1-based index and lists it.
    #
    # @param index_string [String, nil] index as typed by the user
    def group(index_string)
      load_groups
      index = validate_index(index_string, TildeScraper::Group.all.length)
      return nil unless index

      @page = TildeScraper.get_page(TildeScraper::Group.all[index].get_url)
      page_list
    end

    # Makes the site's front page the current page (without listing it).
    def front_page
      @page = TildeScraper.get_page("https://tildes.net")
    end

    # Prints the comments of the topic with the given 1-based index. Comments
    # are only scraped when none are stored for the topic yet.
    #
    # @param index_string [String, nil] index as typed by the user
    def comments(index_string)
      index = validate_index(index_string, @page.topics.length)
      return nil unless index

      topic = @page.topics[index]
      TildeScraper.get_comments(topic.comment_link) if topic.comments.empty?
      TildeScraper::Comment.display_page(topic.comment_link)
    end

    # Prints the topics of the current page.
    def page_list
      if @page
        @page.display
      else
        puts "No page selected"
      end
    end

    # Prints the list of supported commands.
    def help
      puts "To view this list, type: help"
      puts "To view groups: groups"
      puts "To view group page: group [index]"
      puts "To return to front page: frontpage"
      puts "To view topics of current page: list"
      puts "To view next page: next"
      puts "To view prev page: prev"
      puts "To view submission contents: view [index]"
      puts "To view submission comments: comments [index]"
    end

    private

    # Scrapes the group list unless groups are already stored.
    def load_groups
      TildeScraper.get_groups if TildeScraper::Group.all.empty?
    end

    # Converts a 1-based index typed by the user into a 0-based array index.
    #
    # @param index_string [String, nil] raw user input
    # @param max [Integer] number of selectable entries
    # @return [Integer, nil] the 0-based index, or nil (after printing
    #   "Invalid index") when the input is missing, not made of digits only,
    #   or outside 1..max
    def validate_index(index_string, max)
      valid = index_string &&
              index_string =~ /\A\d+\z/ &&
              index_string.to_i.between?(1, max)
      unless valid
        puts "Invalid index"
        return nil
      end
      index_string.to_i - 1
    end
  end
end
@@ -0,0 +1,69 @@
1
# A comment on a topic together with its nested replies.
#
# Every created comment is stored in a class level list; comments belonging
# to the same topic share the same url, and top level comments have level 0.
class TildeScraper::Comment
  extend TildeScraper::Memorable::ClassMethods
  include TildeScraper::Memorable::InstanceMethods

  attr_accessor :children, :text, :author, :votes, :age, :url, :level

  @@all = []

  # @param attributes [Hash] comment attributes; a :children entry is
  #   ignored here because replies are attached by create_from_array
  def initialize(attributes)
    # Start with no replies so a comment built without them can still be displayed.
    @children = []
    add_attributes(attributes.reject { |key, _value| key == :children })
  end

  # Builds comments (and, recursively, their replies) from scraped hashes.
  #
  # @param array [Array<Hash>] comment hashes, each optionally holding its
  #   replies under :children
  # @return [Array<TildeScraper::Comment>] the comments created for +array+
  def self.create_from_array(array)
    array.map do |comment_hash|
      comment = create(comment_hash)
      comment.children = create_from_array(comment_hash[:children] || [])
      comment
    end
  end

  # @return [Array<TildeScraper::Comment>] all comments stored for +url+
  def self.find_by_url(url)
    all.select { |comment| comment.url == url }
  end

  # @return [Array<TildeScraper::Comment>] the level 0 comments stored for +url+
  def self.find_top_by_url(url)
    all.select { |comment| comment.url == url && comment.level == 0 }
  end

  # Prints this comment (author, text, votes when present, separator line).
  #
  # @param indent [Integer] number of tabs printed in front of each line
  def display(indent = 0)
    print_indent(indent)
    puts author
    display_text(indent)
    # Only indent when a votes line is actually printed; otherwise the tabs
    # would end up in front of the separator.
    if votes
      print_indent(indent)
      puts "Votes: #{votes}"
    end
    puts "-" * 10
  end

  # Prints the comment text line by line, each line indented by +indent+ tabs.
  def display_text(indent = 0)
    text.to_s.split("\n").each do |line|
      print_indent(indent)
      puts line
    end
  end

  # Prints the whole comment tree stored for +url+.
  def self.display_page(url)
    display(find_top_by_url(url))
  end

  # Prints the given comments, each followed by its replies one level deeper.
  #
  # @param array [Array<TildeScraper::Comment>] comments to print
  # @param indent [Integer] indentation of this level, in tabs
  def self.display(array, indent = 0)
    array.each do |comment|
      comment.display(indent)
      display(comment.children, indent + 1)
    end
  end

  # @return [Array<TildeScraper::Comment>] every stored level 0 comment
  def self.all_top
    @@all.select { |comment| comment.level == 0 }
  end

  # @return [Array<TildeScraper::Comment>] every stored comment
  def self.all
    @@all
  end

  private

  # Prints +count+ tab characters without a trailing newline.
  def print_indent(count)
    print "\t" * count
  end
end
@@ -0,0 +1,25 @@
1
module TildeScraper
  # Shared behaviour for classes that are built from attribute hashes and
  # keep a list of their instances. The host class must provide a class
  # method +all+ returning that list and a public writer for every attribute
  # key it is given.
  module Memorable
    # Class level helpers; meant to be added with +extend+.
    module ClassMethods
      # Builds an instance from +attributes+ and appends it to +all+.
      #
      # @param attributes [Hash] attribute name => value
      # @return [Object] the new instance
      def create(attributes)
        instance = new(attributes)
        all << instance
        instance
      end

      # Calls create for every hash in +array+.
      #
      # @param array [Array<Hash>] attribute hashes
      # @return [Array<Object>] the created instances, in input order
      def create_from_array(array)
        array.map { |attributes| create(attributes) }
      end
    end

    # Instance level helpers; meant to be added with +include+.
    module InstanceMethods
      # @param attributes [Hash] attribute name => value
      def initialize(attributes)
        add_attributes(attributes)
      end

      # Assigns every entry of +attributes+ through the matching writer.
      # public_send is used so that hash keys can only reach public writers,
      # never private methods.
      #
      # @param attributes [Hash] attribute name => value
      # @raise [NoMethodError] when there is no public writer for a key
      def add_attributes(attributes)
        attributes.each { |key, value| public_send("#{key}=", value) }
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
# A group of the site, as scraped from the group list.
class TildeScraper::Group
  extend TildeScraper::Memorable::ClassMethods
  include TildeScraper::Memorable::InstanceMethods

  attr_accessor :name, :description, :subs

  @@all = []

  # @return [Array<TildeScraper::Group>] every stored group
  def self.all
    @@all
  end

  # Prints the group name followed by a line with description and
  # subscriber count. Interpolation is used so a missing (nil) description
  # or count prints as empty text instead of raising a TypeError.
  def display
    puts name
    puts "#{description} #{subs}"
  end

  # @return [String] address of the group's topic listing
  def get_url
    "https://tildes.net/#{name}"
  end

  # Prints every stored group, numbered from 1, separated by blank lines.
  def self.display
    all.each.with_index(1) do |group, index|
      puts index
      group.display
      puts ""
    end
  end
end
@@ -0,0 +1,45 @@
1
# A scraped listing page. Its topics are not stored on the page itself but
# looked up through the shared page_id.
class TildeScraper::Page
  extend TildeScraper::Memorable::ClassMethods
  include TildeScraper::Memorable::InstanceMethods

  # Query parameters that are copied from the url onto the page. Everything
  # else (for example the before/after paging cursors) is ignored.
  QUERY_KEYS = %w[order period].freeze

  attr_accessor :url, :next_link, :prev_link, :order, :period, :page_id, :group

  @@all = []

  # @return [Array<TildeScraper::Page>] every stored page
  def self.all
    @@all
  end

  # @param attributes [Hash] page attributes; must contain :url, from which
  #   order, period and group are derived
  def initialize(attributes)
    super(attributes)
    get_query
  end

  # @return [Array<TildeScraper::Topic>] the topics created for this page
  def topics
    TildeScraper::Topic.find_by_page_id(page_id)
  end

  # Prints the numbered topics of this page, separated by blank lines.
  def display
    topics.each.with_index(1) do |topic, index|
      puts "#{index}. #{topic.type}post"
      topic.display
      puts ""
    end
  end

  private

  # Derives order, period and group from the url and assigns them.
  #
  # Defaults are order "Activity" and period "All Time". The group is the
  # single "~name" occurrence in the url, or nil when there is none or more
  # than one. Only whitelisted query keys are applied: assigning arbitrary
  # keys would raise NoMethodError for parameters without a writer and would
  # let the query overwrite attributes such as url or page_id.
  def get_query
    query_hash = { order: "Activity", period: "All Time" }

    group_names = url.scan(/~\w*/)
    query_hash[:group] = group_names.length == 1 ? group_names.first : nil

    url_parts = url.split("?")
    if url_parts.length == 2
      url_parts[1].split("&").each do |pair|
        key, value = pair.split("=")
        query_hash[key.to_sym] = value if QUERY_KEYS.include?(key)
      end
    end

    add_attributes(query_hash)
  end
end
@@ -0,0 +1,80 @@
1
# Downloads pages of the site and extracts plain hashes from their HTML.
# No domain objects are created here.
class TildeScraper::Scraper
  BASE_URL = "https://tildes.net"

  # Pagination buttons that are turned into <name>_link page attributes.
  PAGE_BUTTONS = %w[next prev].freeze

  # Scrapes a topic listing.
  #
  # @param url [String] full address of the listing
  # @return [Array] two elements: a hash with general page info (:url and,
  #   when present, :next_link / :prev_link) and an array of topic hashes
  def self.scrape_page(url)
    doc = open_url(url)

    page_info = { url: url }
    doc.css("a.page-item").each do |button|
      button_name = button.text.downcase
      # Ignore any other button text; it has no matching page attribute.
      next unless PAGE_BUTTONS.include?(button_name)

      page_info["#{button_name}_link".to_sym] = button.attribute("href").value
    end

    topics = doc.css("article.topic").map { |topic| scrape_topic(topic) }
    [page_info, topics]
  end

  # Scrapes the top level groups.
  #
  # @param url [String] path of the group list, relative to BASE_URL
  # @return [Array<Hash>] hashes with :name, :description and :subs
  def self.scrape_groups(url)
    doc = open_url(BASE_URL + url)
    doc.css("tr.group-level-0").map do |group|
      {
        name: group.css("a").text,
        description: group.css("p").text,
        subs: group.css("span.group-subscription-count").text.split(" ").first
      }
    end
  end

  # Scrapes the comment tree of a topic.
  #
  # @param url [String] comment path of the topic, relative to BASE_URL
  # @return [Array<Hash>] top level comment hashes; replies are nested
  #   under :children
  def self.scrape_comments(url)
    doc = open_url(BASE_URL + url)
    scrape_children(doc.css("#comments"), url)
  end

  # Extracts one topic hash from an "article.topic" node. Text topics get a
  # :topic_text entry, all others a :link entry taken from the title anchor.
  def self.scrape_topic(topic)
    title = topic.css("h1.topic-title a")
    metadata = topic.css("div.topic-metadata")
    info = {
      title: title.text,
      comment_count: topic.css("div.topic-info-comments").text.strip,
      comment_link: topic.css("div.topic-info-comments a").attribute("href").value.split(" ").first,
      group: metadata.css("span.topic-group").text,
      word_count: metadata.css("span.topic-content-metadata").text.split(" ")[0],
      age: topic.css("time.time-responsive").attribute("data-abbreviated").value,
      votes: topic.css("div.topic-voting span.topic-voting-votes").text
    }

    # The excerpt's <summary> child is skipped; only the remaining nodes count as text.
    excerpt = topic.css(".topic-text-excerpt").children.reject { |el| el.name == "summary" }
    if excerpt.empty?
      info[:link] = title.attribute("href").value
    else
      info[:topic_text] = excerpt.map(&:text).join.strip
    end
    info
  end

  # Recursively extracts the comments directly below +top_comment+.
  #
  # @param top_comment [Object] node set containing the comment list
  # @param url [String] comment path stored on every comment hash
  # @param level [Integer] nesting depth, 0 for top level comments
  # @return [Array<Hash>] comment hashes with nested :children
  def self.scrape_children(top_comment, url, level = 0)
    top_comment.css("> li > article").map do |comment|
      comment_info = comment.css("> div.comment-itself").first
      {
        text: comment_info.css("div.comment-text").text.strip,
        author: comment_info.css("a.link-user").text,
        votes: comment_info.css("div.comment-votes").text.split(" ").first,
        level: level,
        url: url,
        children: scrape_children(comment.css("> ol.comment-tree-replies"), url, level + 1)
      }
    end
  end

  # Downloads +url+ and parses it as HTML.
  #
  # The URI object's own #open (from open-uri) is used instead of
  # Kernel#open: Kernel#open no longer accepts urls on Ruby 3 and would run
  # a command for input starting with "|".
  def self.open_url(url)
    Nokogiri::HTML(URI.parse(url).open)
  end

  # A bare `private` does not affect `def self.` methods, so the helpers are
  # hidden explicitly.
  private_class_method :scrape_topic, :scrape_children, :open_url
end
@@ -0,0 +1,15 @@
1
# Topic whose payload is a link instead of text.
class TildeScraper::LinkTopic < TildeScraper::Topic
  attr_accessor :link

  # Kind label; Page#display prints it in front of "post".
  def type
    "link"
  end

  # Payload of the topic, which for this kind is the link.
  def content
    link
  end

  # Prints the payload on a single labelled line.
  def display_content
    label = "Topic Link: "
    puts "#{label}#{link}"
  end
end
@@ -0,0 +1,15 @@
1
# Topic whose payload is the text excerpt scraped from the listing.
class TildeScraper::TextTopic < TildeScraper::Topic
  attr_accessor :topic_text

  # Kind label; Page#display prints it in front of "post".
  def type
    "text"
  end

  # Payload of the topic, which for this kind is its text.
  def content
    topic_text
  end

  # Prints the payload as is.
  def display_content
    body = topic_text
    puts body
  end
end
@@ -0,0 +1,44 @@
1
# Base class of all topics. Instances of the subclasses are stored in one
# shared list and are tied to their page through page_id.
class TildeScraper::Topic
  attr_accessor :title, :group, :word_count, :age, :votes, :comment_link, :comment_count, :page_id

  extend TildeScraper::Memorable::ClassMethods
  include TildeScraper::Memorable::InstanceMethods

  # Class variable on purpose: the subclasses reach the same list through
  # the inherited +all+.
  @@all = []

  # @param id [Integer] page id assigned when the page was scraped
  # @return [Array<TildeScraper::Topic>] the topics created for that page
  def self.find_by_page_id(id)
    all.select { |topic| topic.page_id == id }
  end

  # @return [Array<TildeScraper::Topic>] every stored topic
  def self.all
    @@all
  end

  # Creates a TextTopic for every hash with a :topic_text key and a
  # LinkTopic for every hash with a :link key.
  #
  # @param array [Array<Hash>] topic attribute hashes
  # @return [Array<Hash>] +array+ itself
  # @raise [TopicError] when a hash has neither :topic_text nor :link
  def self.create_from_array(array)
    array.each do |attributes|
      if attributes.key?(:topic_text)
        TildeScraper::TextTopic.create(attributes)
      elsif attributes.key?(:link)
        TildeScraper::LinkTopic.create(attributes)
      else
        # No debugger breakpoint here: library code must fail with the
        # error instead of stopping in an interactive session.
        raise TopicError
      end
    end
  end

  # @return [Array<TildeScraper::Comment>] the comments stored for this topic
  def comments
    TildeScraper::Comment.find_by_url(comment_link)
  end

  # Prints a two line summary: title and votes, then group, word count,
  # age and comment count.
  def display
    puts "#{title} Votes:#{votes}"
    puts "#{group} WC:#{word_count} #{age} #{comment_count}"
  end

  # Raised when scraped topic data cannot be classified as text or link.
  class TopicError < StandardError
    def message
      "Topic is neither text topic nor link topic"
    end
  end
end
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tilde-scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Noah Evans
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-10-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.10'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.10'
41
+ - !ruby/object:Gem::Dependency
42
+ name: pry
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.12'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.12'
55
+ description: Scrapes the website tildes.net, can scrape topics on a page, contents
56
+ or link of a topic, top level groups, and comments on a topic
57
+ email: noah@nevans.me
58
+ executables:
59
+ - tilde-scraper
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - bin/tilde-scraper
64
+ - config/enviornment.rb
65
+ - lib/tilde_scraper.rb
66
+ - lib/tilde_scraper/api.rb
67
+ - lib/tilde_scraper/cli.rb
68
+ - lib/tilde_scraper/comment.rb
69
+ - lib/tilde_scraper/concerns/memorable.rb
70
+ - lib/tilde_scraper/group.rb
71
+ - lib/tilde_scraper/page.rb
72
+ - lib/tilde_scraper/scraper.rb
73
+ - lib/tilde_scraper/topic/link_topic.rb
74
+ - lib/tilde_scraper/topic/text_topic.rb
75
+ - lib/tilde_scraper/topic/topic.rb
76
+ homepage: http://rubygems.org/gems/tildes-scraper
77
+ licenses:
78
+ - MIT
79
+ metadata: {}
80
+ post_install_message:
81
+ rdoc_options: []
82
+ require_paths:
83
+ - lib
84
+ required_ruby_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ required_rubygems_version: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ requirements: []
95
+ rubygems_version: 3.0.6
96
+ signing_key:
97
+ specification_version: 4
98
+ summary: Scrapes the website tildes.net
99
+ test_files: []