effscraper 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/effscraper.rb +67 -0
- metadata +47 -0
data/lib/effscraper.rb
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'json'
|
4
|
+
require 'uploadconvert'
|
5
|
+
|
6
|
+
# Scrapes the document listing of an EFF case page. Every listed document is
# recorded as a hash (link, local file name, date, title, plus whatever
# UploadConvert extracts from the downloaded PDF) and the whole collection is
# returned as pretty-printed JSON by #scrapeCase.
class EFFScraper
  # Host prepended to the pager's "next page" href values.
  BASE_URL = "https://eff.org"

  # url - String address of the first page of the case's document listing.
  def initialize(url)
    @url = url
    @casearray = []
  end

  # Scrapes all documents in case.
  #
  # Fetches the first listing page, reads the page total from the pager and
  # then follows the "next" link once per remaining page, scraping each page
  # on the way.
  #
  # Returns a String of pretty-printed JSON: an array with one object per
  # scraped list item. Records accumulate in the instance, so calling this
  # twice on the same object returns the documents twice.
  def scrapeCase
    html = fetchPage(@url)

    (1..pageCount(html)).each do |i|
      if i > 1
        link = nextPageLink(html)
        # The pager advertised more pages than it links to: stop instead of
        # crashing on the missing "next" element.
        break unless link
        html = fetchPage(link)
      end

      scrapePage(html)
    end

    JSON.pretty_generate(@casearray)
  end

  # Scrapes each page of documents.
  #
  # html - parsed listing page; it must answer #css the way a Nokogiri
  #        document does.
  #
  # Appends one hash per <li> inside the first "div.view-content" element to
  # the instance's result list. A page without that container is ignored.
  # Items without a usable "[PDF]" link are still recorded (date and title
  # only); nothing is downloaded or extracted for them.
  def scrapePage(html)
    items = html.css("div.view-content")[0]
    return unless items

    items.css("li").each do |l|
      dochash = {}
      anchors = l.css("a")

      # Gets link to document and file
      anchors.each do |a|
        href = a["href"]
        next unless a.text == "[PDF]" && href

        dochash[:url] = href
        downloadPDF(href)
        # The local file name is the last path segment of the link.
        name = href.split("/").last.to_s.strip
        dochash[:path] = name unless name.empty?
      end

      # Get date and title
      dochash[:doc_date] = l.css("span.date-display-single").text
      # The title is taken from the second link of the item.
      title_link = anchors[1]
      dochash[:title] = title_link ? title_link.text : nil

      # Extract metadata and text (only possible when a PDF was linked)
      if dochash[:path]
        u = UploadConvert.new(dochash[:path])
        # NOTE(review): extractMetadataPDF is assumed to return key/value
        # pairs; each pair is copied into the record as-is - confirm against
        # the uploadconvert gem.
        u.extractMetadataPDF.each { |k, v| dochash[k] = v }
        dochash[:text] = u.detectPDFType
      end

      @casearray.push(dochash)
    end
  end

  private

  # Downloads and parses one listing page.
  #
  # Goes through URI#open (provided by open-uri) rather than Kernel#open, so
  # a string such as "| command" can never be executed as a pipe, and the
  # block form closes the downloaded stream once it has been parsed.
  def fetchPage(url)
    URI.parse(url).open { |io| Nokogiri::HTML(io) }
  end

  # Reads the total number of listing pages from the pager label, using the
  # third whitespace-separated word of "li.pager-current" (presumably a label
  # shaped like "1 of 5" - confirm against the live site). Returns 1 when
  # there is no pager or the label cannot be read as a positive number.
  def pageCount(html)
    current = html.css("li.pager-current")[0]
    return 1 unless current

    total = current.text.split(" ")[2].to_i
    total < 1 ? 1 : total
  end

  # Builds the absolute address of the following listing page, or returns nil
  # when the page has no "li.pager-next" link.
  def nextPageLink(html)
    item = html.css("li.pager-next")[0]
    anchor = item && item.css("a")[0]
    href = anchor && anchor["href"]
    href && BASE_URL + href
  end

  # Downloads url into the current directory with wget.
  #
  # The address comes from scraped HTML, so it is handed to wget as a separate
  # argument (no shell is involved) and placed after "--" so it cannot be read
  # as a wget option. The exit status is not checked.
  def downloadPDF(url)
    system("wget", "--", url)
  end
end
|
metadata
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: effscraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- M. C. McGrath
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-03-13 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: Scrapes EFF court documents then extracts the plaintext and metadata.
|
15
|
+
email: shidash@shidash.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/effscraper.rb
|
21
|
+
homepage: https://github.com/Shidash/EFFScraper
|
22
|
+
licenses:
|
23
|
+
- GPL
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
none: false
|
36
|
+
requirements:
|
37
|
+
- - ! '>='
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0'
|
40
|
+
requirements: []
|
41
|
+
rubyforge_project:
|
42
|
+
rubygems_version: 1.8.23
|
43
|
+
signing_key:
|
44
|
+
specification_version: 3
|
45
|
+
summary: Scrapes, extracts text, extracts metadata from EFF court documents
|
46
|
+
test_files: []
|
47
|
+
has_rdoc:
|