ecfs 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/ecfs/filings_query.rb +11 -19
- data/lib/ecfs/proceeding.rb +1 -1
- data/lib/ecfs/proceedings_query.rb +110 -58
- data/lib/ecfs/spreadsheet_parser.rb +54 -38
- data/lib/ecfs/util.rb +6 -1
- data/lib/ecfs/version.rb +1 -1
- data/test/test_filing.rb +4 -0
- data/test/test_proceeding.rb +1 -0
- metadata +2 -2
data/lib/ecfs/filings_query.rb
CHANGED
@@ -54,14 +54,8 @@ module ECFS
|
|
54
54
|
end
|
55
55
|
|
56
56
|
def get
|
57
|
-
|
58
|
-
|
59
|
-
return rows.map do |row|
|
60
|
-
row_to_filing(row)
|
61
|
-
end
|
62
|
-
else
|
63
|
-
return rows
|
64
|
-
end
|
57
|
+
download_spreadsheet!
|
58
|
+
@typecast_results ? @rows.map {|row| row_to_filing(row)} : @rows
|
65
59
|
end
|
66
60
|
|
67
61
|
def row_to_filing(row)
|
@@ -69,20 +63,18 @@ module ECFS
|
|
69
63
|
end
|
70
64
|
|
71
65
|
def mechanize_agent
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
agent
|
66
|
+
Mechanize.new.tap do |agent|
|
67
|
+
agent.follow_meta_refresh = true
|
68
|
+
agent.pluggable_parser["application/vnd.ms-excel"] = ECFS::SpreadsheetParser
|
69
|
+
end
|
77
70
|
end
|
78
71
|
|
79
|
-
def download_spreadsheet
|
80
|
-
agent =
|
81
|
-
page = agent.get(self.url)
|
72
|
+
def download_spreadsheet!
|
73
|
+
agent = mechanize_agent
|
82
74
|
link_text = "\r\n \t \t \tExport to Excel file\r\n \t \t"
|
83
|
-
link =
|
84
|
-
|
85
|
-
agent.click(link)
|
75
|
+
link = agent.get(url).link_with(:text => link_text)
|
76
|
+
|
77
|
+
@rows = agent.click(link).rows
|
86
78
|
end
|
87
79
|
end
|
88
80
|
end
|
data/lib/ecfs/proceeding.rb
CHANGED
@@ -51,8 +51,8 @@ module ECFS
|
|
51
51
|
# if docket_number is given along with other constraints, the other constraints will be ignored.
|
52
52
|
warn "Constraints other than `docket_number` will be ignored." if @constraints.keys.length > 1
|
53
53
|
|
54
|
-
return
|
55
|
-
results = ECFS::Proceeding.new(
|
54
|
+
return scrape_proceeding_page! unless @typecast_results
|
55
|
+
results = ECFS::Proceeding.new(scrape_proceeding_page!)
|
56
56
|
else
|
57
57
|
return scrape_results_page unless @typecast_results
|
58
58
|
results = ECFS::Proceeding::ResultSet.new(scrape_results_page)
|
@@ -61,85 +61,136 @@ module ECFS
|
|
61
61
|
results
|
62
62
|
end
|
63
63
|
|
64
|
-
def mechanize_page
|
65
|
-
Mechanize.new.get(self.url)
|
66
|
-
end
|
67
64
|
|
68
65
|
private
|
69
66
|
|
70
|
-
def
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
if search.length > 0
|
83
|
-
key = search.first.children.first.text.lstrip.rstrip.split(":")[0].gsub(" ", "_").downcase
|
84
|
-
pair << key
|
85
|
-
else
|
86
|
-
value = span.text.lstrip.rstrip
|
87
|
-
value.gsub!(",", "") if value.is_a?(String)
|
88
|
-
pair << value
|
89
|
-
end
|
90
|
-
container << pair
|
67
|
+
def mechanize_page
|
68
|
+
Mechanize.new.get(url)
|
69
|
+
end
|
70
|
+
|
71
|
+
def scrape_proceeding_page!
|
72
|
+
container_to_hash do
|
73
|
+
mechanize_page.search("div").select do |div|
|
74
|
+
div.attributes["class"].nil? == false
|
75
|
+
end.select do |div|
|
76
|
+
div.attributes["class"].text == "wwgrp"
|
77
|
+
end.map do |node|
|
78
|
+
search_node(node)
|
91
79
|
end
|
92
80
|
end
|
81
|
+
end
|
82
|
+
|
83
|
+
# Builds a Hash from the key/value list produced by the given block.
#
# The block's result is flattened and consumed two items at a time: the first
# item of each pair becomes the key, the second its value. A trailing unpaired
# item maps to nil.
#
# The non-destructive `flatten` is used on purpose: `flatten!` returns nil when
# the array is already flat (or empty), which made the chained `each_slice`
# call raise NoMethodError.
def container_to_hash(&block)
  hash = {}
  block.call.flatten.each_slice(2) do |chunk|
    hash[chunk[0]] = chunk[1]
  end

  hash
end
|
102
91
|
|
103
|
-
def
|
104
|
-
|
92
|
+
def search_node(node)
|
93
|
+
node.search("span").map do |span|
|
94
|
+
search = span.search("label")
|
95
|
+
key_or_value_from_search_and_span(search, span)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
def key_or_value_from_search_and_span(search, span)
|
100
|
+
search.length > 0 ? key_from_search(search) : value_from_span(span)
|
101
|
+
end
|
102
|
+
|
103
|
+
def key_from_search(search)
|
104
|
+
format_key_text(search.first.children.first.text)
|
105
|
+
end
|
106
|
+
|
107
|
+
# Normalizes a label's text into a hash key: surrounding whitespace is
# stripped, everything from the first ":" onward is dropped, spaces become
# underscores and the result is lower-cased.
#
#   format_key_text("  Date Created: ") # => "date_created"
#
# Non-bang String methods are used on purpose. The bang variants return nil
# when they change nothing, which broke the `lstrip!.rstrip!` chain for text
# without leading whitespace and made the method return nil (via `downcase!`)
# for text that was already lower-case. The argument is no longer mutated.
def format_key_text(key_text)
  # `to_s` covers text with nothing before the first ":" (split yields nil there).
  key_text.strip.split(":")[0].to_s.gsub(" ", "_").downcase
end
|
113
|
+
|
114
|
+
def value_from_span(span)
|
115
|
+
value = text_from_span(span)
|
116
|
+
value.gsub!(",", "") if value.is_a?(String)
|
117
|
+
|
118
|
+
value
|
119
|
+
end
|
120
|
+
|
121
|
+
def text_from_span(span)
|
122
|
+
span.text.lstrip.rstrip
|
123
|
+
end
|
105
124
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
last = banner[3].gsub(",","").to_i
|
110
|
-
total = banner[5].gsub(",","").to_i
|
111
|
-
table_rows = page.search("//*[@id='yui-main']/div/div[2]/table/tbody/tr[2]/td/table/tbody").children
|
112
|
-
results = table_rows.map { |row| row_to_proceeding(row) }
|
125
|
+
def scrape_results_page
|
126
|
+
page = mechanize_page
|
127
|
+
banner = extract_banner_from_page(page)
|
113
128
|
|
114
129
|
{
|
115
130
|
"constraints" => @constraints,
|
116
|
-
"fcc_url" =>
|
117
|
-
"current_page" =>
|
118
|
-
"total_pages" =>
|
119
|
-
"first_result" =>
|
120
|
-
"last_result" =>
|
121
|
-
"total_results" =>
|
122
|
-
"results" =>
|
131
|
+
"fcc_url" => url,
|
132
|
+
"current_page" => current_page,
|
133
|
+
"total_pages" => total_pages_from_page(page),
|
134
|
+
"first_result" => first_from_banner(banner),
|
135
|
+
"last_result" => last_from_banner(banner),
|
136
|
+
"total_results" => total_from_banner(banner),
|
137
|
+
"results" => proceedings_from_page(page)
|
123
138
|
}
|
124
139
|
end
|
125
140
|
|
126
|
-
def
|
127
|
-
|
141
|
+
def current_page
|
142
|
+
self.constraints["page_number"].gsub(",","").to_i
|
143
|
+
end
|
128
144
|
|
129
|
-
|
145
|
+
def proceedings_from_page(page)
|
146
|
+
extract_table_rows_from_page(page).map do |row|
|
147
|
+
row_to_proceeding(row)
|
148
|
+
end
|
130
149
|
end
|
131
150
|
|
132
|
-
def
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
151
|
+
def extract_table_rows_from_page(page)
|
152
|
+
xpath = "//*[@id='yui-main']/div/div[2]/table/tbody/tr[2]/td/table/tbody"
|
153
|
+
page.search(xpath).children
|
154
|
+
end
|
155
|
+
|
156
|
+
def first_from_banner(banner)
|
157
|
+
extract_from_banner(banner, 1)
|
158
|
+
end
|
159
|
+
|
160
|
+
def last_from_banner(banner)
|
161
|
+
extract_from_banner(banner, 3)
|
162
|
+
end
|
163
|
+
|
164
|
+
def total_from_banner(banner)
|
165
|
+
extract_from_banner(banner, 5)
|
166
|
+
end
|
167
|
+
|
168
|
+
def extract_banner_from_page(page)
|
169
|
+
xpath = "//*[@id='yui-main']/div/div[2]/table/tbody/tr[2]/td/span[1]"
|
170
|
+
page.search(xpath).text.tap do |t|
|
171
|
+
t.lstrip!
|
172
|
+
t.rstrip!
|
173
|
+
end.split("Modify Search")[0].rstrip.split
|
174
|
+
end
|
137
175
|
|
176
|
+
def extract_from_banner(banner, index)
|
177
|
+
banner[index].gsub(",", "").to_i
|
178
|
+
end
|
179
|
+
|
180
|
+
def total_pages_from_page(page)
|
181
|
+
page.link_with(:text => "Last").attributes.first[1].split("pageNumber=")[1].gsub(",","").to_i
|
182
|
+
end
|
183
|
+
|
184
|
+
def row_to_proceeding(row)
|
185
|
+
ECFS::Proceeding.new(row_to_hash(row))
|
186
|
+
end
|
187
|
+
|
188
|
+
def row_to_hash(row)
|
138
189
|
{
|
139
|
-
"docket_number" =>
|
140
|
-
"bureau" =>
|
141
|
-
"subject" =>
|
142
|
-
"filings_in_last_30_days" =>
|
190
|
+
"docket_number" => docket_number_from_row(row),
|
191
|
+
"bureau" => bureau_from_row(row),
|
192
|
+
"subject" => subject_from_row(row),
|
193
|
+
"filings_in_last_30_days" => filings_in_last_30_days_from_row(row)
|
143
194
|
}
|
144
195
|
end
|
145
196
|
|
@@ -158,6 +209,7 @@ module ECFS
|
|
158
209
|
def filings_in_last_30_days_from_row(row)
|
159
210
|
row.children[6].children.first.text.lstrip.rstrip.to_i
|
160
211
|
end
|
212
|
+
#####
|
161
213
|
|
162
214
|
end
|
163
215
|
end
|
@@ -11,62 +11,78 @@ module ECFS
|
|
11
11
|
def initialize(uri = nil, response = nil, body = nil, code = nil)
|
12
12
|
super(uri, response, body, code)
|
13
13
|
@body = body
|
14
|
-
|
15
|
-
format_rows!
|
14
|
+
@rows = formatted_rows
|
16
15
|
end
|
17
16
|
|
18
17
|
private
|
19
18
|
|
20
|
-
def
|
21
|
-
|
22
|
-
|
23
|
-
|
19
|
+
def file
|
20
|
+
StringIO.new(@body)
|
21
|
+
end
|
22
|
+
|
23
|
+
def book
|
24
|
+
Spreadsheet.open(file)
|
25
|
+
end
|
26
|
+
|
27
|
+
def sheet
|
28
|
+
book.worksheet(0)
|
29
|
+
end
|
30
|
+
|
31
|
+
def unformatted_rows
|
32
|
+
my_rows = []
|
24
33
|
first = false
|
25
|
-
|
26
|
-
|
34
|
+
sheet.each do |row|
|
35
|
+
my_rows << row if first
|
27
36
|
first = true
|
28
37
|
end
|
29
38
|
|
30
|
-
|
39
|
+
my_rows
|
31
40
|
end
|
32
41
|
|
33
|
-
def
|
34
|
-
|
35
|
-
|
36
|
-
indices = (7..row.length-1).to_a
|
37
|
-
indices.each do |i|
|
38
|
-
text = row[i].data.split("id=")[1]
|
39
|
-
urls << "http://apps.fcc.gov/ecfs/document/view?id=#{extract_filing_id(text)}"
|
40
|
-
end
|
41
|
-
|
42
|
-
{
|
43
|
-
"name_of_filer" => row[1],
|
44
|
-
"docket_number" => row[0],
|
45
|
-
"lawfirm_name" => row[2],
|
46
|
-
"date_received" => format_date(row[3]),
|
47
|
-
"date_posted" => format_date(row[4]),
|
48
|
-
"exparte" => format_exparte(row[5]),
|
49
|
-
"type_of_filing" => row[6],
|
50
|
-
"document_urls" => urls
|
51
|
-
}
|
42
|
+
def formatted_rows
|
43
|
+
unformatted_rows.map do |row|
|
44
|
+
row_to_hash(row)
|
52
45
|
end
|
53
46
|
end
|
54
47
|
|
55
|
-
def
|
56
|
-
|
57
|
-
|
58
|
-
|
48
|
+
def row_to_hash(row)
|
49
|
+
{
|
50
|
+
"name_of_filer" => row[1],
|
51
|
+
"docket_number" => row[0],
|
52
|
+
"lawfirm_name" => row[2],
|
53
|
+
"date_received" => format_iso_date(row[3]),
|
54
|
+
"date_posted" => format_iso_date(row[4]),
|
55
|
+
"exparte" => bool_from_exparte(row[5]),
|
56
|
+
"type_of_filing" => row[6],
|
57
|
+
"document_urls" => urls_from_row(row)
|
58
|
+
}
|
59
|
+
end
|
60
|
+
|
61
|
+
def urls_from_row(row)
|
62
|
+
indices = (7..row.length-1).to_a
|
63
|
+
|
64
|
+
indices.map do |index|
|
65
|
+
extract_url_from_row_and_index(row, index)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
def extract_url_from_row_and_index(row, index)
|
70
|
+
text = row[index].data.split("id=")[1]
|
71
|
+
|
72
|
+
"http://apps.fcc.gov/ecfs/document/view?id=#{extract_filing_id(text)}"
|
59
73
|
end
|
60
74
|
|
61
75
|
def extract_filing_id(txt)
|
62
76
|
re1='(\\d+)'
|
63
77
|
re=(re1)
|
64
78
|
m = Regexp.new(re, Regexp::IGNORECASE)
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
79
|
+
|
80
|
+
m.match(txt)[1]
|
81
|
+
end
|
82
|
+
|
83
|
+
def bool_from_exparte(y_or_n)
|
84
|
+
{"Y" => true, "N" => false}[y_or_n]
|
69
85
|
end
|
70
86
|
|
71
|
-
end
|
72
|
-
end
|
87
|
+
end
|
88
|
+
end
|
data/lib/ecfs/util.rb
CHANGED
@@ -1,10 +1,15 @@
|
|
1
1
|
module ECFS
  # Date-string formatting helpers.
  module Util
    # Converts a US-style "MM/DD/YYYY" date string into an ISO 8601 timestamp
    # string fixed at midnight UTC.
    #
    #   format_iso_date("12/22/1988") # => "1988-12-22T00:00:00.000Z"
    #
    # Month and day are zero-padded so that unpadded input such as "1/5/2013"
    # still produces a valid ISO 8601 date ("2013-01-05T00:00:00.000Z").
    def format_iso_date(date)
      month, day, year = date.split("/")
      new_date = "#{year}-#{month.to_s.rjust(2, "0")}-#{day.to_s.rjust(2, "0")}"
      # The input has no time component, so midnight UTC is hard-coded.
      "#{new_date}T00:00:00.000Z"
    end

    # Reorders the date part of an ISO timestamp (the text before "T").
    #
    #   iso_date_to_simple_date("1988-12-22T00:00:00.000Z") # => "12-1988-22"
    #
    # NOTE(review): the result is month-year-day, which looks unintentional
    # (month-day-year seems more likely) -- confirm against callers before
    # changing it.
    def iso_date_to_simple_date(iso_date)
      year, month, day = iso_date.split("T")[0].split("-")
      "#{month}-#{year}-#{day}"
    end
  end
end
|
data/lib/ecfs/version.rb
CHANGED
data/test/test_filing.rb
CHANGED
@@ -42,6 +42,10 @@ class TestFiling < Test::Unit::TestCase
|
|
42
42
|
assert_equal ECFS::Document::Page, page.class
|
43
43
|
assert_equal String, page.text.class
|
44
44
|
assert_equal Fixnum, page.page_number.class
|
45
|
+
|
46
|
+
#VCR.use_cassette('test_proceedings_query_test_get_proceeding_info') do
|
47
|
+
# binding.pry
|
48
|
+
#end
|
45
49
|
end
|
46
50
|
end
|
47
51
|
end
|
data/test/test_proceeding.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ecfs
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-05-
|
12
|
+
date: 2013-05-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|