guardianscraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/guardianscraper.rb +36 -0
  3. metadata +45 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7844bf4e98d141cf05ae449a8cf20dcaf99cadc9
4
+ data.tar.gz: 462c193afcfec2d3535bd585d4256f35f393982a
5
+ SHA512:
6
+ metadata.gz: b5ae4e816de1b2efaea18456bf53d3d1587561d82bc086e31decfb8c0e88d9a9d96e3d8567bd886ae75301cb7e9505910163af3ce692ed64a1b58e0855736992
7
+ data.tar.gz: 78071d2df2b3fc6a42d36d6ceb064b7279c51b5d02ac16bdc1c5408218f514bbb126c96e31124c38fde9504600c9678a4b66d5ef0714ccc888808293dead1c99
@@ -0,0 +1,36 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'json'
4
+
5
# Scraper for the articles about the NSA docs in the Guardian.
#
# Wraps a single article URL. #getArticle downloads that page, pulls a fixed
# set of fields out of the markup with CSS selectors and returns them as a
# pretty-printed JSON string.
class GuardianScraper
  # @param url [String] address of the article to scrape
  def initialize(url)
    @url = url
  end

  # Download the article and extract the text and other data.
  #
  # Errors raised while opening the URL are not rescued here and propagate
  # to the caller.
  #
  # @return [String] pretty-printed JSON object with the keys headline,
  #   description, date, author, published_by, caption, documents, text and
  #   plaintext (in that order)
  def getArticle
    html = fetch_document

    # Both of these node sets are read more than once below.
    description = html.css('div[itemprop="description"]')
    body_text = html.css("div#article-body-blocks").text

    articlehash = {
      # NOTE(review): the trailing space inside "name headline " is kept from
      # the original selector; presumably it mirrors the page markup - confirm.
      headline: html.css('h1[itemprop="name headline "]').text,
      description: description.text,
      date: html.css('time[itemprop="datePublished"]').text,
      author: html.css("a.contributor").text,
      published_by: "The Guardian",
      caption: html.css("div.caption").text,
      # List of documents linked to from the description block.
      documents: description.css("a").map { |link| link["href"] },
      # Text of the article; both keys carry the same value, as before.
      text: body_text,
      plaintext: body_text
    }

    JSON.pretty_generate(articlehash)
  end

  private

  # Fetch @url and parse the response into a Nokogiri document.
  #
  # open-uri stopped patching Kernel#open in Ruby 3.0, so a bare `open(url)`
  # now looks for a local file. URI.open is the supported entry point; Rubies
  # older than 2.5 do not expose it publicly, so fall back to Kernel.open
  # there. The block form closes the handle once parsing is done.
  def fetch_document
    opener = URI.respond_to?(:open) ? URI : Kernel
    opener.open(@url) { |io| Nokogiri::HTML(io) }
  end
end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: guardianscraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-04-29 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Scrapes Guardian articles.
14
+ email: shidash@shidash.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/guardianscraper.rb
20
+ homepage: https://github.com/Shidash/GuardianScraper
21
+ licenses:
22
+ - GPL
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.0.14
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Scrapes Guardian articles
44
+ test_files: []
45
+ has_rdoc: