wpcrawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/wpcrawler.rb +40 -0
  3. metadata +44 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 319765da5c2b1b0f233b815b4d9165a6929934c376f40b8b9233b6544c357cd8
4
+ data.tar.gz: 8d6388658c421dd5064f849acc68e4825193799e544aa29124bb2be29e1c4cd6
5
+ SHA512:
6
+ metadata.gz: c715ceaeee1b2763e20096551d463e9e2d660a7692c085a8fc7d28d41cce82fd8ad6c955731ffdb64c23bbce6a3b8469a259b528fd7ef42174d9f1f859c3d1a6
7
+ data.tar.gz: eacddd6bad4afd305dd708cd5809d23ec456139b8ba1d2b9d96d86657e6ba66d7872363145293faced26f7345def00378d77a9264e644dcc260d3fbf47558c7d
data/lib/wpcrawler.rb ADDED
@@ -0,0 +1,40 @@
1
require 'json'
require 'http'
require 'tty-spinner'
4
+
5
+ class Crawler
6
+ attr_accessor :url
7
+
8
+ def initialize(url, type)
9
+ @url = url
10
+ @type = type
11
+ end
12
+
13
+ def scrape
14
+ begin
15
+ spinner = TTY::Spinner.new("[:spinner]", format: :dots)
16
+ spinner.auto_spin # Automatic animation with default interval
17
+ output = File.open( "wpoutputfile.csv","w+" )
18
+ output.puts "Date | Title | Author | Link | Status"
19
+ (1..8).each do |n|
20
+ response = HTTP.timeout(5).get("https://" + @url.to_s + "/wp-json/wp/v2/#{@type}/" +"?page=#{n}&per_page=100").to_s
21
+ JSON.parse(response).each do |e|
22
+ output.puts "#{e.fetch('date')}" + "| #{e&.dig('title','rendered')}" + "| #{e.fetch('author')}"+ "| #{e.dig('link')}"+ "| #{e.dig('status')}"
23
+ end rescue TypeError
24
+ end
25
+ output.close
26
+ spinner.stop('Done!') # Stop animation
27
+ rescue HTTP::TimeoutError
28
+ puts "Connection ERROR - make sure your website is a wordpress site with an open api"
29
+ end
30
+ end
31
+ end
32
+
33
# Command-line entry point: the first argument is the site host (without
# scheme), the second is the kind of content to download.
host, type = ARGV

# Validate before building the crawler so that bad input never reaches the
# network layer.
if !%w[pages posts].include?(type)
  puts "ERROR: type needs to be pages or posts"
elsif host.nil? || host.strip.empty?
  puts "ERROR: site url is missing"
else
  Crawler.new(host, type).scrape
end
40
+
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wpcrawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Duarte Martins
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-01-22 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A simple crawler that gets posts and pages from wordpress websites that
14
+ have an exposed api
15
+ email:
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/wpcrawler.rb
21
+ homepage: https://rubygems.org/gems/wpcrawler
22
+ licenses:
23
+ - MIT
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubygems_version: 3.1.2
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Simple wordpress crawler
44
+ test_files: []