guardianscraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/guardianscraper.rb +36 -0
  3. metadata +45 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7844bf4e98d141cf05ae449a8cf20dcaf99cadc9
4
+ data.tar.gz: 462c193afcfec2d3535bd585d4256f35f393982a
5
+ SHA512:
6
+ metadata.gz: b5ae4e816de1b2efaea18456bf53d3d1587561d82bc086e31decfb8c0e88d9a9d96e3d8567bd886ae75301cb7e9505910163af3ce692ed64a1b58e0855736992
7
+ data.tar.gz: 78071d2df2b3fc6a42d36d6ceb064b7279c51b5d02ac16bdc1c5408218f514bbb126c96e31124c38fde9504600c9678a4b66d5ef0714ccc888808293dead1c99
@@ -0,0 +1,36 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'json'
4
+
5
# Scraper for the articles about the NSA docs in the Guardian.
#
# Wraps one article URL. #getArticle downloads that page and returns the
# fields it extracts from the markup as a JSON string.
class GuardianScraper
  # @param url [String] address of the article to scrape
  def initialize(url)
    @url = url
  end

  # Download the article and collect the text and other data.
  #
  # @return [String] pretty-printed JSON object with the keys headline,
  #   description, date, author, published_by, caption, documents, text and
  #   plaintext (in that order)
  # @raise [URI::InvalidURIError] if the stored URL cannot be parsed
  # @raise [ArgumentError] if the stored URL is not of a kind open-uri can fetch
  def getArticle
    html = fetch_document

    # Both of these selections feed two fields each, so query them once.
    description = html.css('div[itemprop="description"]')
    body_text = html.css("div#article-body-blocks").text

    articlehash = {
      # Misc data on the article
      headline: html.css('h1[itemprop="name headline "]').text,
      description: description.text,
      date: html.css('time[itemprop="datePublished"]').text,
      author: html.css("a.contributor").text,
      published_by: "The Guardian",
      caption: html.css("div.caption").text,
      # List of documents linked to from the description
      documents: description.css("a").map { |link| link["href"] },
      # Text of the article (both keys carry the same value)
      text: body_text,
      plaintext: body_text
    }

    JSON.pretty_generate(articlehash)
  end

  private

  # Fetch @url and parse the response with Nokogiri.
  #
  # Goes through the URI object rather than Kernel#open: Kernel#open treats a
  # string starting with "|" as a command to run, and from Ruby 3.0 on it no
  # longer opens URLs at all. The block form closes the downloaded stream once
  # parsing is done.
  def fetch_document
    uri = URI(@url)
    unless uri.respond_to?(:open)
      raise ArgumentError, "cannot fetch #{@url.inspect}: not an openable URL"
    end

    uri.open { |io| Nokogiri::HTML(io) }
  end
end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: guardianscraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-04-29 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Scrapes Guardian articles.
14
+ email: shidash@shidash.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/guardianscraper.rb
20
+ homepage: https://github.com/Shidash/GuardianScraper
21
+ licenses:
22
+ - GPL
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.0.14
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Scrapes Guardian articles
44
+ test_files: []
45
+ has_rdoc: