effscraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. data/lib/effscraper.rb +67 -0
  2. metadata +47 -0
data/lib/effscraper.rb ADDED
@@ -0,0 +1,67 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'json'
4
+ require 'uploadconvert'
5
+
6
# Scrapes the documents listed on an EFF case page: follows the pager,
# downloads every "[PDF]" link with wget, and collects date, title,
# metadata and text for each document (extraction is done by UploadConvert).
class EFFScraper
  # Base against which the pager's "next" links are resolved.
  BASE_URL = "https://eff.org".freeze

  # url - address of the first page of the case's document listing.
  def initialize(url)
    @url = url
    @casearray = []
  end

  # Scrapes all documents in case
  #
  # Returns a pretty-printed JSON string of every document collected so far.
  # @casearray is never cleared, so repeated calls keep accumulating entries.
  def scrapeCase
    html = fetchPage(@url)
    scrapePage(html)

    # The first page is already handled; follow "next" for the remaining ones.
    (pageCount(html) - 1).times do
      nextitem = html.css("li.pager-next")[0]
      nextlink = nextitem && nextitem.css("a")[0]
      # Stop quietly if the pager ends earlier than the reported page count.
      break unless nextlink && nextlink["href"]

      html = fetchPage(URI.join(BASE_URL, nextlink["href"]).to_s)
      scrapePage(html)
    end

    JSON.pretty_generate(@casearray)
  end

  # Scrapes each page of documents
  #
  # html - parsed page (anything responding to #css the way a Nokogiri
  #        document does).
  #
  # Appends one hash per list item that carries a "[PDF]" link to @casearray.
  # Items without a usable PDF link are skipped, since there is no file to
  # extract metadata or text from.
  def scrapePage(html)
    items = html.css("div.view-content")[0]
    return unless items

    items.css("li").each do |l|
      dochash = {}
      links = l.css("a")

      # Gets link to document and file
      links.each do |a|
        href = a["href"]
        next unless a.text == "[PDF]" && href

        dochash[:url] = href
        downloadFile(href)
        # wget stores the file under the last path segment of the URL.
        dochash[:path] = href.split("/").last.to_s.strip
      end

      next if dochash[:path].to_s.empty?

      # Get date and title
      dochash[:doc_date] = l.css("span.date-display-single").text
      # The title is read from the second anchor of the list item.
      titlelink = links[1]
      dochash[:title] = titlelink ? titlelink.text : nil

      # Extract metadata and text
      u = UploadConvert.new(dochash[:path])
      metadata = u.extractMetadataPDF
      metadata.each { |k, v| dochash[k] = v }
      dochash[:text] = u.detectPDFType
      @casearray.push(dochash)
    end
  end

  private

  # Number of listing pages, taken from the third whitespace-separated token
  # of the "current page" pager item (presumably "X of N" - confirm against
  # the live markup). Falls back to 1 when there is no pager or no usable
  # number, so the first page is always scraped.
  def pageCount(html)
    current = html.css("li.pager-current")[0]
    return 1 unless current

    total = current.text.split(" ")[2].to_i
    total < 1 ? 1 : total
  end

  # Fetches url and parses it as HTML.
  # Uses URI.open where open-uri provides it (Kernel#open no longer opens
  # URLs on Ruby 3.0+, and treats a leading "|" as a command); older Rubies
  # fall back to the open-uri patched Kernel#open.
  def fetchPage(url)
    io = URI.respond_to?(:open) ? URI.open(url) : open(url)
    Nokogiri::HTML(io)
  ensure
    io.close if io && !io.closed?
  end

  # Downloads url into the current directory with wget.
  # Arguments are passed as a list so the scraped URL never reaches a shell,
  # and "--" keeps a URL starting with "-" from being read as an option.
  def downloadFile(url)
    warn "wget failed for #{url}" unless system("wget", "--", url)
  end
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: effscraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - M. C. McGrath
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-03-13 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Scrapes EFF court documents then extracts the plaintext and metadata.
15
+ email: shidash@shidash.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/effscraper.rb
21
+ homepage: https://github.com/Shidash/EFFScraper
22
+ licenses:
23
+ - GPL
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 1.8.23
43
+ signing_key:
44
+ specification_version: 3
45
+ summary: Scrapes, extracts text, extracts metadata from EFF court documents
46
+ test_files: []
47
+ has_rdoc: