ecfs 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/ecfs/filings_query.rb +11 -19
- data/lib/ecfs/proceeding.rb +1 -1
- data/lib/ecfs/proceedings_query.rb +110 -58
- data/lib/ecfs/spreadsheet_parser.rb +54 -38
- data/lib/ecfs/util.rb +6 -1
- data/lib/ecfs/version.rb +1 -1
- data/test/test_filing.rb +4 -0
- data/test/test_proceeding.rb +1 -0
- metadata +2 -2
data/lib/ecfs/filings_query.rb
CHANGED
@@ -54,14 +54,8 @@ module ECFS
|
|
54
54
|
end
|
55
55
|
|
56
56
|
def get
|
57
|
-
|
58
|
-
|
59
|
-
return rows.map do |row|
|
60
|
-
row_to_filing(row)
|
61
|
-
end
|
62
|
-
else
|
63
|
-
return rows
|
64
|
-
end
|
57
|
+
download_spreadsheet!
|
58
|
+
@typecast_results ? @rows.map {|row| row_to_filing(row)} : @rows
|
65
59
|
end
|
66
60
|
|
67
61
|
def row_to_filing(row)
|
@@ -69,20 +63,18 @@ module ECFS
|
|
69
63
|
end
|
70
64
|
|
71
65
|
def mechanize_agent
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
agent
|
66
|
+
Mechanize.new.tap do |agent|
|
67
|
+
agent.follow_meta_refresh = true
|
68
|
+
agent.pluggable_parser["application/vnd.ms-excel"] = ECFS::SpreadsheetParser
|
69
|
+
end
|
77
70
|
end
|
78
71
|
|
79
|
-
def download_spreadsheet
|
80
|
-
agent =
|
81
|
-
page = agent.get(self.url)
|
72
|
+
def download_spreadsheet!
|
73
|
+
agent = mechanize_agent
|
82
74
|
link_text = "\r\n \t \t \tExport to Excel file\r\n \t \t"
|
83
|
-
link =
|
84
|
-
|
85
|
-
agent.click(link)
|
75
|
+
link = agent.get(url).link_with(:text => link_text)
|
76
|
+
|
77
|
+
@rows = agent.click(link).rows
|
86
78
|
end
|
87
79
|
end
|
88
80
|
end
|
data/lib/ecfs/proceeding.rb
CHANGED
data/lib/ecfs/proceedings_query.rb
CHANGED
@@ -51,8 +51,8 @@ module ECFS
|
|
51
51
|
# if docket_number is given along with other constraints, the other constraints will be ignored.
|
52
52
|
warn "Constraints other than `docket_number` will be ignored." if @constraints.keys.length > 1
|
53
53
|
|
54
|
-
return
|
55
|
-
results = ECFS::Proceeding.new(
|
54
|
+
return scrape_proceeding_page! unless @typecast_results
|
55
|
+
results = ECFS::Proceeding.new(scrape_proceeding_page!)
|
56
56
|
else
|
57
57
|
return scrape_results_page unless @typecast_results
|
58
58
|
results = ECFS::Proceeding::ResultSet.new(scrape_results_page)
|
@@ -61,85 +61,136 @@ module ECFS
|
|
61
61
|
results
|
62
62
|
end
|
63
63
|
|
64
|
-
def mechanize_page
|
65
|
-
Mechanize.new.get(self.url)
|
66
|
-
end
|
67
64
|
|
68
65
|
private
|
69
66
|
|
70
|
-
def
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
if search.length > 0
|
83
|
-
key = search.first.children.first.text.lstrip.rstrip.split(":")[0].gsub(" ", "_").downcase
|
84
|
-
pair << key
|
85
|
-
else
|
86
|
-
value = span.text.lstrip.rstrip
|
87
|
-
value.gsub!(",", "") if value.is_a?(String)
|
88
|
-
pair << value
|
89
|
-
end
|
90
|
-
container << pair
|
67
|
+
def mechanize_page
|
68
|
+
Mechanize.new.get(url)
|
69
|
+
end
|
70
|
+
|
71
|
+
def scrape_proceeding_page!
|
72
|
+
container_to_hash do
|
73
|
+
mechanize_page.search("div").select do |div|
|
74
|
+
div.attributes["class"].nil? == false
|
75
|
+
end.select do |div|
|
76
|
+
div.attributes["class"].text == "wwgrp"
|
77
|
+
end.map do |node|
|
78
|
+
search_node(node)
|
91
79
|
end
|
92
80
|
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def container_to_hash(&block)
|
93
84
|
hash = {}
|
94
|
-
|
95
|
-
hash
|
85
|
+
block.call.flatten!.each_slice(2) do |chunk|
|
86
|
+
hash[chunk[0]] = chunk[1]
|
96
87
|
end
|
97
88
|
|
98
|
-
hash["date_created"] = format_date(hash["date_created"])
|
99
|
-
|
100
89
|
hash
|
101
90
|
end
|
102
91
|
|
103
|
-
def
|
104
|
-
|
92
|
+
def search_node(node)
|
93
|
+
node.search("span").map do |span|
|
94
|
+
search = span.search("label")
|
95
|
+
key_or_value_from_search_and_span(search, span)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
def key_or_value_from_search_and_span(search, span)
|
100
|
+
search.length > 0 ? key_from_search(search) : value_from_span(span)
|
101
|
+
end
|
102
|
+
|
103
|
+
def key_from_search(search)
|
104
|
+
format_key_text(search.first.children.first.text)
|
105
|
+
end
|
106
|
+
|
107
|
+
def format_key_text(key_text)
|
108
|
+
key_text.lstrip!.rstrip!
|
109
|
+
key_text = key_text.split(":")[0]
|
110
|
+
key_text.gsub!(" ", "_")
|
111
|
+
key_text.downcase!
|
112
|
+
end
|
113
|
+
|
114
|
+
def value_from_span(span)
|
115
|
+
value = text_from_span(span)
|
116
|
+
value.gsub!(",", "") if value.is_a?(String)
|
117
|
+
|
118
|
+
value
|
119
|
+
end
|
120
|
+
|
121
|
+
def text_from_span(span)
|
122
|
+
span.text.lstrip.rstrip
|
123
|
+
end
|
105
124
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
last = banner[3].gsub(",","").to_i
|
110
|
-
total = banner[5].gsub(",","").to_i
|
111
|
-
table_rows = page.search("//*[@id='yui-main']/div/div[2]/table/tbody/tr[2]/td/table/tbody").children
|
112
|
-
results = table_rows.map { |row| row_to_proceeding(row) }
|
125
|
+
def scrape_results_page
|
126
|
+
page = mechanize_page
|
127
|
+
banner = extract_banner_from_page(page)
|
113
128
|
|
114
129
|
{
|
115
130
|
"constraints" => @constraints,
|
116
|
-
"fcc_url" =>
|
117
|
-
"current_page" =>
|
118
|
-
"total_pages" =>
|
119
|
-
"first_result" =>
|
120
|
-
"last_result" =>
|
121
|
-
"total_results" =>
|
122
|
-
"results" =>
|
131
|
+
"fcc_url" => url,
|
132
|
+
"current_page" => current_page,
|
133
|
+
"total_pages" => total_pages_from_page(page),
|
134
|
+
"first_result" => first_from_banner(banner),
|
135
|
+
"last_result" => last_from_banner(banner),
|
136
|
+
"total_results" => total_from_banner(banner),
|
137
|
+
"results" => proceedings_from_page(page)
|
123
138
|
}
|
124
139
|
end
|
125
140
|
|
126
|
-
def
|
127
|
-
|
141
|
+
def current_page
|
142
|
+
self.constraints["page_number"].gsub(",","").to_i
|
143
|
+
end
|
128
144
|
|
129
|
-
|
145
|
+
def proceedings_from_page(page)
|
146
|
+
extract_table_rows_from_page(page).map do |row|
|
147
|
+
row_to_proceeding(row)
|
148
|
+
end
|
130
149
|
end
|
131
150
|
|
132
|
-
def
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
151
|
+
def extract_table_rows_from_page(page)
|
152
|
+
xpath = "//*[@id='yui-main']/div/div[2]/table/tbody/tr[2]/td/table/tbody"
|
153
|
+
page.search(xpath).children
|
154
|
+
end
|
155
|
+
|
156
|
+
def first_from_banner(banner)
|
157
|
+
extract_from_banner(banner, 1)
|
158
|
+
end
|
159
|
+
|
160
|
+
def last_from_banner(banner)
|
161
|
+
extract_from_banner(banner, 3)
|
162
|
+
end
|
163
|
+
|
164
|
+
def total_from_banner(banner)
|
165
|
+
extract_from_banner(banner, 5)
|
166
|
+
end
|
167
|
+
|
168
|
+
def extract_banner_from_page(page)
|
169
|
+
xpath = "//*[@id='yui-main']/div/div[2]/table/tbody/tr[2]/td/span[1]"
|
170
|
+
page.search(xpath).text.tap do |t|
|
171
|
+
t.lstrip!
|
172
|
+
t.rstrip!
|
173
|
+
end.split("Modify Search")[0].rstrip.split
|
174
|
+
end
|
137
175
|
|
176
|
+
def extract_from_banner(banner, index)
|
177
|
+
banner[index].gsub(",", "").to_i
|
178
|
+
end
|
179
|
+
|
180
|
+
def total_pages_from_page(page)
|
181
|
+
page.link_with(:text => "Last").attributes.first[1].split("pageNumber=")[1].gsub(",","").to_i
|
182
|
+
end
|
183
|
+
|
184
|
+
def row_to_proceeding(row)
|
185
|
+
ECFS::Proceeding.new(row_to_hash(row))
|
186
|
+
end
|
187
|
+
|
188
|
+
def row_to_hash(row)
|
138
189
|
{
|
139
|
-
"docket_number" =>
|
140
|
-
"bureau" =>
|
141
|
-
"subject" =>
|
142
|
-
"filings_in_last_30_days" =>
|
190
|
+
"docket_number" => docket_number_from_row(row),
|
191
|
+
"bureau" => bureau_from_row(row),
|
192
|
+
"subject" => subject_from_row(row),
|
193
|
+
"filings_in_last_30_days" => filings_in_last_30_days_from_row(row)
|
143
194
|
}
|
144
195
|
end
|
145
196
|
|
@@ -158,6 +209,7 @@ module ECFS
|
|
158
209
|
def filings_in_last_30_days_from_row(row)
|
159
210
|
row.children[6].children.first.text.lstrip.rstrip.to_i
|
160
211
|
end
|
212
|
+
#####
|
161
213
|
|
162
214
|
end
|
163
215
|
end
|
data/lib/ecfs/spreadsheet_parser.rb
CHANGED
@@ -11,62 +11,78 @@ module ECFS
|
|
11
11
|
def initialize(uri = nil, response = nil, body = nil, code = nil)
|
12
12
|
super(uri, response, body, code)
|
13
13
|
@body = body
|
14
|
-
|
15
|
-
format_rows!
|
14
|
+
@rows = formatted_rows
|
16
15
|
end
|
17
16
|
|
18
17
|
private
|
19
18
|
|
20
|
-
def
|
21
|
-
|
22
|
-
|
23
|
-
|
19
|
+
def file
|
20
|
+
StringIO.new(@body)
|
21
|
+
end
|
22
|
+
|
23
|
+
def book
|
24
|
+
Spreadsheet.open(file)
|
25
|
+
end
|
26
|
+
|
27
|
+
def sheet
|
28
|
+
book.worksheet(0)
|
29
|
+
end
|
30
|
+
|
31
|
+
def unformatted_rows
|
32
|
+
my_rows = []
|
24
33
|
first = false
|
25
|
-
|
26
|
-
|
34
|
+
sheet.each do |row|
|
35
|
+
my_rows << row if first
|
27
36
|
first = true
|
28
37
|
end
|
29
38
|
|
30
|
-
|
39
|
+
my_rows
|
31
40
|
end
|
32
41
|
|
33
|
-
def
|
34
|
-
|
35
|
-
|
36
|
-
indices = (7..row.length-1).to_a
|
37
|
-
indices.each do |i|
|
38
|
-
text = row[i].data.split("id=")[1]
|
39
|
-
urls << "http://apps.fcc.gov/ecfs/document/view?id=#{extract_filing_id(text)}"
|
40
|
-
end
|
41
|
-
|
42
|
-
{
|
43
|
-
"name_of_filer" => row[1],
|
44
|
-
"docket_number" => row[0],
|
45
|
-
"lawfirm_name" => row[2],
|
46
|
-
"date_received" => format_date(row[3]),
|
47
|
-
"date_posted" => format_date(row[4]),
|
48
|
-
"exparte" => format_exparte(row[5]),
|
49
|
-
"type_of_filing" => row[6],
|
50
|
-
"document_urls" => urls
|
51
|
-
}
|
42
|
+
def formatted_rows
|
43
|
+
unformatted_rows.map do |row|
|
44
|
+
row_to_hash(row)
|
52
45
|
end
|
53
46
|
end
|
54
47
|
|
55
|
-
def
|
56
|
-
|
57
|
-
|
58
|
-
|
48
|
+
def row_to_hash(row)
|
49
|
+
{
|
50
|
+
"name_of_filer" => row[1],
|
51
|
+
"docket_number" => row[0],
|
52
|
+
"lawfirm_name" => row[2],
|
53
|
+
"date_received" => format_iso_date(row[3]),
|
54
|
+
"date_posted" => format_iso_date(row[4]),
|
55
|
+
"exparte" => bool_from_exparte(row[5]),
|
56
|
+
"type_of_filing" => row[6],
|
57
|
+
"document_urls" => urls_from_row(row)
|
58
|
+
}
|
59
|
+
end
|
60
|
+
|
61
|
+
def urls_from_row(row)
|
62
|
+
indices = (7..row.length-1).to_a
|
63
|
+
|
64
|
+
indices.map do |index|
|
65
|
+
extract_url_from_row_and_index(row, index)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
def extract_url_from_row_and_index(row, index)
|
70
|
+
text = row[index].data.split("id=")[1]
|
71
|
+
|
72
|
+
"http://apps.fcc.gov/ecfs/document/view?id=#{extract_filing_id(text)}"
|
59
73
|
end
|
60
74
|
|
61
75
|
def extract_filing_id(txt)
|
62
76
|
re1='(\\d+)'
|
63
77
|
re=(re1)
|
64
78
|
m = Regexp.new(re, Regexp::IGNORECASE)
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
79
|
+
|
80
|
+
m.match(txt)[1]
|
81
|
+
end
|
82
|
+
|
83
|
+
def bool_from_exparte(y_or_n)
|
84
|
+
{"Y" => true, "N" => false}[y_or_n]
|
69
85
|
end
|
70
86
|
|
71
|
-
end
|
72
|
-
end
|
87
|
+
end
|
88
|
+
end
|
data/lib/ecfs/util.rb
CHANGED
@@ -1,10 +1,15 @@
|
|
1
1
|
module ECFS
|
2
2
|
module Util
|
3
|
-
def
|
3
|
+
def format_iso_date(date)
|
4
4
|
# input format 12/22/1988
|
5
5
|
chunks = date.split("/")
|
6
6
|
new_date = "#{chunks[2]}-#{chunks[0]}-#{chunks[1]}" # "1988-12-22"
|
7
7
|
"#{new_date}T00:00:00.000Z" # dumb hack
|
8
8
|
end
|
9
|
+
|
10
|
+
def iso_date_to_simple_date(iso_date)
|
11
|
+
chunks = iso_date.split("T")[0].split("-")
|
12
|
+
"#{chunks[1]}-#{chunks[0]}-#{chunks[2]}"
|
13
|
+
end
|
9
14
|
end
|
10
15
|
end
|
data/lib/ecfs/version.rb
CHANGED
data/test/test_filing.rb
CHANGED
@@ -42,6 +42,10 @@ class TestFiling < Test::Unit::TestCase
|
|
42
42
|
assert_equal ECFS::Document::Page, page.class
|
43
43
|
assert_equal String, page.text.class
|
44
44
|
assert_equal Fixnum, page.page_number.class
|
45
|
+
|
46
|
+
#VCR.use_cassette('test_proceedings_query_test_get_proceeding_info') do
|
47
|
+
# binding.pry
|
48
|
+
#end
|
45
49
|
end
|
46
50
|
end
|
47
51
|
end
|
data/test/test_proceeding.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ecfs
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-05-
|
12
|
+
date: 2013-05-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|