apw_articles 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/apw_articles +5 -0
- data/config/environment.rb +10 -0
- data/lib/apw_articles.rb +5 -0
- data/lib/apw_articles/article.rb +33 -0
- data/lib/apw_articles/category.rb +36 -0
- data/lib/apw_articles/cli.rb +85 -0
- data/lib/apw_articles/scraper.rb +53 -0
- metadata +51 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 762d2ad97904f469c347327ca3bf900ea49ed4af
|
4
|
+
data.tar.gz: 9af397ae00b4c5781ef1d419aa8e6f329397b23e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7e857e447b235f45510310db1fee667d447d52d858991de58b8750cc9dbaa51a3dac448718a5135788660a508640008cbc7b7e1aa346298c4c6c54ced3bf8e3e
|
7
|
+
data.tar.gz: f1dce15cbb89fc5524c23a6b42f1834a8bff0b5fce399b3a1553a100cb63ce2ca6d3150ce628342565d9e05c06ccf1ab91d7ed0fde82390980e04fc97b53a5d3
|
data/bin/apw_articles
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'pry'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'colorize'
|
5
|
+
|
6
|
+
|
7
|
+
require_relative '../lib/apw_articles/article.rb'
|
8
|
+
require_relative '../lib/apw_articles/category.rb'
|
9
|
+
require_relative '../lib/apw_articles/cli.rb'
|
10
|
+
require_relative '../lib/apw_articles/scraper.rb'
|
data/lib/apw_articles.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# Represents a single article scraped from the A Practical Wedding site.
# Holds the article's title, author, blurb, url, and its Category objects.
class APWArticles::Article
  attr_accessor :title, :author, :blurb, :url, :categories

  @@all = []

  # Builds an article from a hash of attributes.
  # The :categories value is a list of category URL slugs; each slug is
  # resolved to a Category object and the association is wired up in both
  # directions. Every other key is assigned through its attribute writer.
  def initialize(attribute_hash)
    self.categories = []
    attribute_hash.each do |key, value|
      if key == :categories
        value.each do |category|
          c = APWArticles::Category.find_or_create_by_url(category)
          self.categories << c
          c.articles << self
        end
      else
        self.send("#{key}=", value)
      end
    end
    # BUG FIX: this registration used to live inside the each loop above,
    # which added the same article to @@all once per attribute key.
    @@all << self
  end

  # Scrapes the given article URL and builds an Article from the resulting
  # attribute hash.
  def self.new_from_url(url)
    self.new(APWArticles::Scraper.scrape_article(url))
  end

  def self.new_from_list(list_url)
    # TODO: call scraper for the list and then initialize from hash
  end

  # Every article instantiated so far.
  def self.all
    @@all
  end

end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# Represents an article category on A Practical Wedding.
# A category knows its display name, its URL slug, and its articles.
class APWArticles::Category
  attr_accessor :name, :articles, :url

  @@all = []

  # NOTE: these slugs were gathered once with
  # APWArticles::Scraper.scrape_categories; scraping them live means
  # iterating over 66 articles, which is too slow for the CLI, so they are
  # hard-coded as defaults. New category objects are still created when
  # unknown slugs are encountered in articles.
  CATEGORIES = ["divorce", "kids-no-kids", "sex", "career", "life", "marriage-essays", "money", "feminism", "essays", "the-hard-stuff", "reclaiming-wife", "advice", "genderfeminism", "friends-relations", "engagements-proposals", "happy-hour"].freeze

  # Ensures a Category object exists for every default slug.
  def self.defaults
    CATEGORIES.each { |url| self.find_or_create_by_url(url) }
  end

  # url is a slug like "marriage-essays"; the display name becomes
  # "Marriage Essays". New instances register themselves in @@all.
  def initialize(url)
    self.name = url.gsub(/-/, ' ').split.map(&:capitalize).join(' ')
    self.url = url
    self.class.all << self
    self.articles = []
  end

  # Every category instantiated so far.
  def self.all
    @@all
  end

  # Returns the existing category with this url slug, or creates one.
  # (Fixed: the original scanned the list with detect twice.)
  def self.find_or_create_by_url(url)
    self.all.detect { |category| category.url == url } || self.new(url)
  end

  # Creates a category for every slug discovered by scraping the site.
  def self.create_from_url
    APWArticles::Scraper.scrape_categories.each do |category|
      self.find_or_create_by_url(category)
    end
  end

end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
# Interactive command-line interface: lists categories, pages through
# articles in a category, and shows details for a chosen article.
class APWArticles::CLI

  # Entry point: seeds the default categories, then starts the
  # category-selection loop.
  def run
    APWArticles::Category.defaults
    self.list_categories
  end # basic functionality

  # Lists categories by iterating over APWArticles::Category.all and
  # requests input (a 1-based category number) to view that category's
  # article list. Re-prompts until the input is in range.
  def list_categories
    puts "------------ A Practical Wedding - Marriage Essays ------------\n".colorize(:cyan)
    puts "CATEGORIES:"
    APWArticles::Category.all.each_with_index do |category, index|
      print "#{index+1}.\t".colorize(:cyan)
      puts "#{category.name}"
    end # do loop end
    puts "\nPlease choose a category by number".colorize(:blue)
    input = gets.strip
    # non-numeric input yields to_i == 0 and is rejected by the guard below
    until input.to_i > 0 && input.to_i <= APWArticles::Category.all.size
      puts "Please type a number between 1 and #{APWArticles::Category.all.size}.".colorize(:blue)
      input = gets.strip
    end # until end
    self.list_articles_in_category_by_page(APWArticles::Category.all[input.to_i-1], 1)
  end # list_categories def end

  # Based on a category and a page number, this method creates an array of
  # indexes of articles to list, calls the scraper to populate the
  # category's articles, then prints the article number and title for each
  # populated index. Finally it asks the user to choose an article number
  # (or 'next' for the following page).
  def list_articles_in_category_by_page(category, page = 1)
    articles_to_display = Array (((page*10)-10)..((page*10)-1)) # page 1 = 0-9, page 2 = 10-19
    puts "\n\n------------ Articles in #{category.name} ------------".colorize(:cyan)
    # Skip re-scraping for CLI pages that fall inside a site page already
    # fetched (each site page covers several CLI pages of 10).
    APWArticles::Scraper.scrape_list(category.url, page) unless page.between?(2,5) || page.between?(7,12)
    # NOTE this is very laggy and perhaps shouldn't take place here.
    articles_to_display.each do |article_num|
      # nil guard: the last page may have fewer than 10 articles
      print "#{article_num+1}.\t".colorize(:cyan) unless
        category.articles[article_num] == nil
      puts "#{category.articles[article_num].title}" unless
        category.articles[article_num] == nil
    end # do end
    # NOTE: I might like to split out the input logic here.
    puts "\n\nType the article number to view more information about the article. \nOr type 'next' to view the next page of articles.".colorize(:blue)
    input = gets.strip
    # accept 'next' (case-insensitive) or a number within this page's range
    until /(?i)next/ === input || ( input.to_i >= (articles_to_display[0]+1) && input.to_i <= (articles_to_display[-1]+1) )
      puts "Please type a number between #{articles_to_display[0]+1} and #{articles_to_display[-1]+1} or type 'next'.".colorize(:blue)
      input = gets.strip
    end # until end
    if /(?i)next/ === input
      page += 1
      # recurse into the next page of the same category
      list_articles_in_category_by_page(category, page)
    else
      self.article_information(
        category.articles[input.to_i-1].url)
    end # if end
  end # list_articles_in_category_by_page def end

  # This method creates a new Article object from the URL passed in and
  # prints each of the object's attributes (title, author, blurb, url and
  # categories). Then it requests input to return to the list or exit.
  def article_information(article_url)
    article = APWArticles::Article.new_from_url(article_url)
    print "\nTitle:".colorize(:cyan)
    puts "#{article.title}"
    print "\nAuthor:".colorize(:cyan)
    puts "#{article.author}"
    print "\n\nBlurb:".colorize(:cyan)
    puts "\"#{article.blurb}...\""
    print "\nURL:".colorize(:cyan)
    puts "#{article_url}"
    article_categories = []
    article.categories.each do |category| # category is an object, and I want its name
      article_categories << category.name
    end # do end
    print "\nCategories:".colorize(:cyan)
    puts "#{article_categories.join(", ")}."
    puts "Type 'list' to return to the category list page. To exit, type 'exit'".colorize(:blue)
    input = gets.strip
    # validating input: only 'list' or 'exit' (case-insensitive) accepted
    until /(?i)exit/ === input || /(?i)list/ === input
      puts "Please type 'list' or 'exit'.".colorize(:blue)
      input = gets.strip
    end # until end
    if /(?i)exit/ === input
      abort("Thank you.")
    elsif /(?i)list/ === input
      self.list_categories
    end # if end
  end # def article_information end

end # class end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# Scrapes category and article pages from apracticalwedding.com using
# Nokogiri + open-uri.
class APWArticles::Scraper

  # Fetches the article list page(s) needed for a given CLI page number and
  # creates an Article per post link (title, url, category). Each site page
  # holds ~66 articles while the CLI shows 10 per "page", so the CLI page
  # number is mapped to the range of site pages (i..j) to fetch. Returns nil.
  def self.scrape_list(category, page = 1)
    # NOTE: probably should only scrape the number of articles needed for
    # the given request / call - fetching whole site pages is very laggy.
    i = 1 if page.between?(1, 6)
    i = 2 if page.between?(7, 13)
    i = 3 if page > 13
    j = 1 if page.between?(1, 5)
    j = 2 if page.between?(6, 12)
    j = 3 if page > 12
    until i > j
      # FIX: use URI.open — Kernel#open on URLs (open-uri) was deprecated
      # in Ruby 2.7 and removed in 3.0. URI.open exists since Ruby 2.5.
      Nokogiri::HTML(URI.open("https://apracticalwedding.com/category/marriage-essays/#{category}/page/#{i}/?listas=list")).css(".type-post").each do |post|
        APWArticles::Article.new({url: post.css("a").attribute("href").value, title: post.css("h2").text, categories: [category]})
      end # do end
      i += 1
    end # until loop end
    nil
  end # def end

  # Takes an article URL, scrapes the page, and returns a hash with
  # :title, :author, :url, :blurb (first 400 chars of the body text) and
  # :categories (an array of category URL slugs).
  def self.scrape_article(url)
    article = {}
    doc = Nokogiri::HTML(URI.open(url))
    article[:title] = doc.css("h1").text
    article[:author] = doc.css(".staff-info h2").text
    article[:url] = url
    article[:blurb] = doc.css(".entry p").text[0, 400]
    categories = []
    doc.css(".categories a").each do |link|
      # the last path segment of the category link is its slug
      categories << link.attribute("href").value.split("/")[-1]
    end # do end
    article[:categories] = categories
    article
  end # returns hash of information on the article.

  # Takes a URL of a list page of essays at APW, collects the "class"
  # attribute of each post, and extracts the unique category slugs that
  # follow each "category-" prefix.
  def self.scrape_categories(url = "https://apracticalwedding.com/category/marriage-essays/?listas=list")
    doc = Nokogiri::HTML(URI.open(url)).css(".type-post")
    link_attributes = []
    categories = []
    doc.each { |link| link_attributes << link.attribute("class").value }
    link_attributes.each do |attributes_list|
      attributes_array = attributes_list.split(/ category-/)
      attributes_array.slice!(0) # drop the text before the first "category-"
      attributes_array.each do |category|
        categories << category.split[0]
      end # attributes_array do end
    end # link_attributes do end
    categories.uniq
  end # self.scrape_categories end

end
|
metadata
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: apw_articles
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Rachel Walwood
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-03-21 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A Ruby Gem to explore articles on the A Practical Wedding website that
|
14
|
+
are useful after the wedding.
|
15
|
+
email: walwoodr@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- bin/apw_articles
|
21
|
+
- config/environment.rb
|
22
|
+
- lib/apw_articles.rb
|
23
|
+
- lib/apw_articles/article.rb
|
24
|
+
- lib/apw_articles/category.rb
|
25
|
+
- lib/apw_articles/cli.rb
|
26
|
+
- lib/apw_articles/scraper.rb
|
27
|
+
homepage: http://rubygems.org/gems/apw_articles
|
28
|
+
licenses:
|
29
|
+
- MIT
|
30
|
+
metadata: {}
|
31
|
+
post_install_message:
|
32
|
+
rdoc_options: []
|
33
|
+
require_paths:
|
34
|
+
- lib
|
35
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0'
|
40
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: '0'
|
45
|
+
requirements: []
|
46
|
+
rubyforge_project:
|
47
|
+
rubygems_version: 2.6.10
|
48
|
+
signing_key:
|
49
|
+
specification_version: 4
|
50
|
+
summary: Articles from A Practical Wedding
|
51
|
+
test_files: []
|