wpcrawler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/wpcrawler.rb +40 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 319765da5c2b1b0f233b815b4d9165a6929934c376f40b8b9233b6544c357cd8
|
4
|
+
data.tar.gz: 8d6388658c421dd5064f849acc68e4825193799e544aa29124bb2be29e1c4cd6
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c715ceaeee1b2763e20096551d463e9e2d660a7692c085a8fc7d28d41cce82fd8ad6c955731ffdb64c23bbce6a3b8469a259b528fd7ef42174d9f1f859c3d1a6
|
7
|
+
data.tar.gz: eacddd6bad4afd305dd708cd5809d23ec456139b8ba1d2b9d96d86657e6ba66d7872363145293faced26f7345def00378d77a9264e644dcc260d3fbf47558c7d
|
data/lib/wpcrawler.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'json'        # stdlib parser; must be lowercase — require is a file lookup and 'JSON' fails on case-sensitive filesystems
require 'http'        # http gem: chainable HTTP client (HTTP.timeout(...).get(...))
require 'tty-spinner' # tty-spinner gem: terminal progress animation
|
4
|
+
|
5
|
+
# Crawls a WordPress site's public REST API (wp-json/wp/v2) and dumps the
# requested collection to a pipe-separated file named "wpoutputfile.csv".
class Crawler
  attr_accessor :url

  # url  - host name of the target site (the "https://" scheme is added by #scrape)
  # type - REST collection to fetch; callers pass "posts" or "pages"
  def initialize(url, type)
    @url = url
    @type = type
  end

  # Fetches up to 8 pages of 100 items each and writes one pipe-separated
  # line per item. Prints an error message instead of raising when the site
  # cannot be reached within the 5-second timeout.
  def scrape
    spinner = TTY::Spinner.new("[:spinner]", format: :dots)
    spinner.auto_spin # animate while the (slow) HTTP requests run

    # Block form guarantees the file handle is closed even if a request
    # raises — the original leaked it on HTTP::TimeoutError.
    File.open("wpoutputfile.csv", "w+") do |output|
      output.puts "Date | Title | Author | Link | Status"
      (1..8).each do |page|
        body = HTTP.timeout(5)
                   .get("https://#{@url}/wp-json/wp/v2/#{@type}/?page=#{page}&per_page=100")
                   .to_s
        begin
          JSON.parse(body).each do |item|
            output.puts "#{item.fetch('date')}| #{item.dig('title', 'rendered')}| #{item.fetch('author')}| #{item.dig('link')}| #{item.dig('status')}"
          end
        rescue TypeError
          # Past the last real page the API returns an error Hash instead of
          # an Array; iterating it raises TypeError — skip and keep going.
        end
      end
    end
    spinner.stop('Done!')
  rescue HTTP::TimeoutError
    puts "Connection ERROR - make sure your website is a wordpress site with an open api"
  end
end
|
32
|
+
|
33
|
+
# Command-line entry point: wpcrawler <host> <pages|posts>.
# Validate the collection type BEFORE building the crawler (the original
# instantiated it unconditionally), and use || — `or` has precedence traps.
if ARGV[1] == "pages" || ARGV[1] == "posts"
  Crawler.new(ARGV[0], ARGV[1]).scrape
else
  puts "ERROR: type needs to be pages or posts"
end
|
40
|
+
|
metadata
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wpcrawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Duarte Martins
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-01-22 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A simple crawler that gets posts and pages from wordpress websites that
|
14
|
+
have an exposed api
|
15
|
+
email:
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/wpcrawler.rb
|
21
|
+
homepage: https://rubygems.org/gems/wpcrawler
|
22
|
+
licenses:
|
23
|
+
- MIT
|
24
|
+
metadata: {}
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubygems_version: 3.1.2
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: Simple wordpress crawler
|
44
|
+
test_files: []
|