effscraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/effscraper.rb +67 -0
  2. metadata +47 -0
data/lib/effscraper.rb ADDED
@@ -0,0 +1,67 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'json'
4
+ require 'uploadconvert'
5
+
6
# Scrapes the court documents listed on an EFF case page: walks the paginated
# listing, downloads every document linked as "[PDF]" with wget, runs the
# downloaded file through UploadConvert and returns the collected records as
# JSON.
#
# Public method names keep their original camelCase spelling so existing
# callers continue to work.
class EFFScraper
  # Site root that pager links are resolved against.
  BASE_URL = "https://eff.org"

  # url - String address of the first page of the case's document listing.
  def initialize(url)
    @url = url
    @casearray = []
  end

  # Scrapes all documents in the case, following the pager from page to page.
  #
  # Returns a pretty-printed JSON String holding one object per list item
  # collected so far by this instance.
  def scrapeCase
    html = fetch_page(@url)
    remaining = page_count(html) - 1

    # The first page is always scraped, even when the pager text cannot be
    # parsed into a page total.
    scrapePage(html)

    remaining.times do
      link = next_page_link(html)
      break unless link # pager advertised more pages than it links to

      html = fetch_page(link)
      scrapePage(html)
    end

    JSON.pretty_generate(@casearray)
  end

  # Scrapes each document listed on one page and appends a record per list
  # item to the instance's result array.
  #
  # html - parsed page; anything answering css(selector) the way a Nokogiri
  #        document does.
  #
  # Returns nothing of interest. A page without a "div.view-content"
  # container is skipped instead of raising.
  def scrapePage(html)
    listing = html.css("div.view-content")[0]
    return unless listing

    listing.css("li").each do |item|
      @casearray.push(scrape_item(item))
    end
  end

  private

  # Loads and parses the page at +address+.
  #
  # open-uri's Kernel#open override for URLs was removed in Ruby 3.0, so
  # URI.open is used wherever it exists; older interpreters fall back to
  # Kernel#open. The block form closes the stream once it has been parsed.
  def fetch_page(address)
    if URI.respond_to?(:open)
      URI.open(address) { |io| Nokogiri::HTML(io) }
    else
      open(address) { |io| Nokogiri::HTML(io) }
    end
  end

  # Reads the total number of pages from the "li.pager-current" element.
  # The third whitespace-separated token is taken as the total (presumably
  # the text reads like "1 of 5" - confirm against the live markup).
  #
  # Returns an Integer, never less than 1.
  def page_count(html)
    current = html.css("li.pager-current")[0]
    return 1 unless current

    total = current.text.split(" ")[2].to_i
    total < 1 ? 1 : total
  end

  # Returns the absolute URL of the next listing page, or nil when the page
  # carries no usable "li.pager-next" link. URI.join accepts both relative
  # and absolute hrefs.
  def next_page_link(html)
    pager = html.css("li.pager-next")[0]
    anchor = pager && pager.css("a")[0]
    href = anchor && anchor["href"]
    return nil if href.to_s.strip.empty?

    URI.join(BASE_URL, href.strip).to_s
  end

  # Builds the record for a single list item: link and local file name of
  # its "[PDF]" document, date, title, and - when the file is available -
  # the metadata and text obtained from UploadConvert.
  def scrape_item(item)
    anchors = item.css("a")
    dochash = {}
    available = false

    # Gets link to document and file
    anchors.each do |a|
      next unless a.text == "[PDF]"

      href = a["href"]
      next if href.to_s.strip.empty?

      dochash[:url] = href
      dochash[:path] = href.split("/").last.to_s.strip
      # A copy left over from an earlier run is still usable if wget fails.
      available = download(href.strip) || File.exist?(dochash[:path])
    end

    # Get date and title; the title is the text of the item's second link.
    dochash[:doc_date] = item.css("span.date-display-single").text
    title_link = anchors[1]
    dochash[:title] = title_link ? title_link.text : nil

    extract_pdf(dochash) if available
    dochash
  end

  # Downloads +url+ into the current directory with wget.
  #
  # The argument-list form of system never passes the scraped URL through a
  # shell, so characters such as ";" or "$(...)" in an href cannot run
  # commands. "--" stops wget from reading the URL as an option.
  #
  # Returns true on success, false or nil otherwise.
  def download(url)
    ok = system("wget", "--", url)
    warn "EFFScraper: could not download #{url}" unless ok
    ok
  end

  # Extract metadata and text: copies every key/value pair reported by
  # UploadConvert#extractMetadataPDF into +dochash+ and stores the result of
  # UploadConvert#detectPDFType under :text.
  def extract_pdf(dochash)
    converter = UploadConvert.new(dochash[:path])
    (converter.extractMetadataPDF || {}).each { |key, value| dochash[key] = value }
    dochash[:text] = converter.detectPDFType
  end
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: effscraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - M. C. McGrath
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-03-13 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Scrapes EFF court documents then extracts the plaintext and metadata.
15
+ email: shidash@shidash.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/effscraper.rb
21
+ homepage: https://github.com/Shidash/EFFScraper
22
+ licenses:
23
+ - GPL
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 1.8.23
43
+ signing_key:
44
+ specification_version: 3
45
+ summary: Scrapes, extracts text, extracts metadata from EFF court documents
46
+ test_files: []
47
+ has_rdoc: