wpcrawler 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/wpcrawler.rb +40 -0
  3. metadata +44 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 319765da5c2b1b0f233b815b4d9165a6929934c376f40b8b9233b6544c357cd8
4
+ data.tar.gz: 8d6388658c421dd5064f849acc68e4825193799e544aa29124bb2be29e1c4cd6
5
+ SHA512:
6
+ metadata.gz: c715ceaeee1b2763e20096551d463e9e2d660a7692c085a8fc7d28d41cce82fd8ad6c955731ffdb64c23bbce6a3b8469a259b528fd7ef42174d9f1f859c3d1a6
7
+ data.tar.gz: eacddd6bad4afd305dd708cd5809d23ec456139b8ba1d2b9d96d86657e6ba66d7872363145293faced26f7345def00378d77a9264e644dcc260d3fbf47558c7d
data/lib/wpcrawler.rb ADDED
@@ -0,0 +1,40 @@
1
require 'json'
require 'http'
require 'tty-spinner'
4
+
5
+ class Crawler
6
+ attr_accessor :url
7
+
8
+ def initialize(url, type)
9
+ @url = url
10
+ @type = type
11
+ end
12
+
13
+ def scrape
14
+ begin
15
+ spinner = TTY::Spinner.new("[:spinner]", format: :dots)
16
+ spinner.auto_spin # Automatic animation with default interval
17
+ output = File.open( "wpoutputfile.csv","w+" )
18
+ output.puts "Date | Title | Author | Link | Status"
19
+ (1..8).each do |n|
20
+ response = HTTP.timeout(5).get("https://" + @url.to_s + "/wp-json/wp/v2/#{@type}/" +"?page=#{n}&per_page=100").to_s
21
+ JSON.parse(response).each do |e|
22
+ output.puts "#{e.fetch('date')}" + "| #{e&.dig('title','rendered')}" + "| #{e.fetch('author')}"+ "| #{e.dig('link')}"+ "| #{e.dig('status')}"
23
+ end rescue TypeError
24
+ end
25
+ output.close
26
+ spinner.stop('Done!') # Stop animation
27
+ rescue HTTP::TimeoutError
28
+ puts "Connection ERROR - make sure your website is a wordpress site with an open api"
29
+ end
30
+ end
31
+ end
32
+
33
+ site = Crawler.new(ARGV[0], ARGV[1])
34
+
35
+ if ARGV[1] == "pages" or ARGV[1] == "posts"
36
+ site.scrape
37
+ else
38
+ puts "ERROR: type needs to be pages or posts"
39
+ end
40
+
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wpcrawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Duarte Martins
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-01-22 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A simple crawler that gets posts and pages from wordpress websites that
14
+ have an exposed api
15
+ email:
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/wpcrawler.rb
21
+ homepage: https://rubygems.org/gems/wpcrawler
22
+ licenses:
23
+ - MIT
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubygems_version: 3.1.2
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Simple wordpress crawler
44
+ test_files: []