mediawiki_table_scraper 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 82745728df615aa53c1e9275ddfabdd1476217f8
4
+ data.tar.gz: 9a1115015b99c30cbea93f6947ce0c1fc622eff0
5
+ SHA512:
6
+ metadata.gz: 82e76b145d542b0b953d3600ededb71776e36941ec01b0d3442c937289d7d9c481a302e3f387d5cc49e028f40061338fc92032f6f8c033d96441b73b8a9b1122
7
+ data.tar.gz: 734127c59510a08c4171f6eff361b9b413eeb64775f4e1a3d6debef66fcb7211865a06da72cb7dadd37514ce529f3df67b10d91cf167a10375865836def10dc9
@@ -0,0 +1 @@
1
+ p�"u��0���v��~�#g�k[^ð�̿������ۊ%Ǡl�3yڳ
Binary file
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # file: mediawiki_table_scraper.rb
4
+
5
+
6
+ require 'nokorexi'
7
+
8
+
9
# Collects every element matching the +.wikitable+ CSS selector in a document
# and exposes each one as an array of hash records keyed by column name.
#
# The document comes from Nokorexi.new(url).to_doc; presumably that fetches
# and parses the page at +url+ -- confirm against the nokorexi gem.
class MediaWikiTableScraper

  # Matches an opening or closing markup tag. Used to reduce the serialised
  # XML of a child element to its bare text.
  TAG_PATTERN = %r{</?\w+[^>]*>}.freeze

  # One entry per matched table; each entry is an array of hashes mapping a
  # column-name symbol to the cell text of one row.
  attr_reader :tables

  # url - passed straight to Nokorexi.new to obtain the document to scrape.
  def initialize(url)
    doc = Nokorexi.new(url).to_doc
    @tables = doc.root.css('.wikitable').map { |table| table_records(table) }
  end

  # Returns the same array as #tables.
  def to_a
    @tables
  end

  private

  # Converts one table element into an array of hash records.
  #
  # The first 'tr' row supplies the column names (its 'th' text nodes,
  # downcased and symbolised); every following row becomes one record.
  # NOTE(review): only 'tr' elements directly under the table are selected;
  # rows wrapped in a <tbody> may not match -- confirm against the parser.
  def table_records(table)
    rows = table.xpath 'tr'
    header = rows.shift

    # A table without any rows has no header to read names from; report it
    # as having no records rather than failing on nil.
    return [] unless header

    names = header.xpath('th/text()').map { |label| label.downcase.to_sym }

    rows.map do |row|
      values = row.xpath('td').map { |cell| cell_text(cell) }
      names.zip(values).to_h
    end
  end

  # Returns the text of a single 'td' cell.
  #
  # A cell containing child elements is flattened: string children are kept
  # as they are, element children are serialised and stripped of their tags,
  # and the pieces are joined with a single space.
  def cell_text(cell)
    return cell.text.to_s unless cell.has_elements?

    pieces = cell.children.map do |child|
      child.is_a?(String) ? child : child.xml.gsub(TAG_PATTERN, '')
    end

    pieces.join ' '
  end

end
56
+
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mediawiki_table_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - James Robertson
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain:
11
+ - |
12
+ -----BEGIN CERTIFICATE-----
13
+ MIIDljCCAn6gAwIBAgIBATANBgkqhkiG9w0BAQUFADBIMRIwEAYDVQQDDAlnZW1t
14
+ YXN0ZXIxHjAcBgoJkiaJk/IsZAEZFg5qYW1lc3JvYmVydHNvbjESMBAGCgmSJomT
15
+ 8ixkARkWAmV1MB4XDTE2MDYxNDE1MTQwOVoXDTE3MDYxNDE1MTQwOVowSDESMBAG
16
+ A1UEAwwJZ2VtbWFzdGVyMR4wHAYKCZImiZPyLGQBGRYOamFtZXNyb2JlcnRzb24x
17
+ EjAQBgoJkiaJk/IsZAEZFgJldTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoC
18
+ ggEBALc/0KGICil5uPNrZZUisbHWa1tPdaZoRfO03/t5KI1XdnT/1K8bcuw8j5Fb
19
+ 2e+2QnDfIHVj3F9hZ4BtA1Z4lF7PxlHtQ4SjhxBLeLmcZtX8ZFIi73PLNItwtOf/
20
+ CSI8oGyNxFUKzcbNtRTat8+jRUSs15vUrfxQD/q0RAZTnfamLj1b6ijrL1nqk36T
21
+ x4wkEEdyPwou7I8qKJcaYHVrYCrbR5x9ZD77fqsLPNkIoI+SoBQa39+Ph+EjpXHV
22
+ qg4bOklAI3Wmn/nQhepuQ9dCQM5zIn85WpwAiK5QeIaqrvnGuBztW5KRBa1diQit
23
+ aGKlLl962VtYTp+uALhDBHo/z00CAwEAAaOBijCBhzAJBgNVHRMEAjAAMAsGA1Ud
24
+ DwQEAwIEsDAdBgNVHQ4EFgQUnLL92rOFDH49Ig6knBxlNicuha4wJgYDVR0RBB8w
25
+ HYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1h
26
+ c3RlckBqYW1lc3JvYmVydHNvbi5ldTANBgkqhkiG9w0BAQUFAAOCAQEADi7zgG/H
27
+ OjC7uEVwwyhHNAt2lW+FdETDGpvJmcB/Z5lB4gbDEozB+hOnTd3oKH8DZeoyEi/Y
28
+ 3QmxuNDvAKeNV5bngMxG/5k+zeY4tenyK2K1VVzlV8zgfnd3JWpqbDGRjOYqV+2K
29
+ IZxoUnra09diNAo5c74oOaxfS75Tfle2zwjDLHAJat+kxhVmsnMXBSDpE4RDz8E3
30
+ aWA3AnmwsgbUbHlniNMwSgSn8JzmCp0vRRIyN1Lw2rmoX1IXsAsAk1t/2RwBn+LC
31
+ FWN+XXB3cbVDsx+uRkGyPtPZZbuqezbtXZUMmv+4kceg02I8lDt4PEk8Hrd5ybDi
32
+ WVhFo+q8CNPc1Q==
33
+ -----END CERTIFICATE-----
34
+ date: 2016-06-14 00:00:00.000000000 Z
35
+ dependencies:
36
+ - !ruby/object:Gem::Dependency
37
+ name: nokorexi
38
+ requirement: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - "~>"
41
+ - !ruby/object:Gem::Version
42
+ version: '0.3'
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: 0.3.1
46
+ type: :runtime
47
+ prerelease: false
48
+ version_requirements: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - "~>"
51
+ - !ruby/object:Gem::Version
52
+ version: '0.3'
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: 0.3.1
56
+ description:
57
+ email: james@r0bertson.co.uk
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - lib/mediawiki_table_scraper.rb
63
+ homepage: https://github.com/jrobertson/mediawiki_table_scraper
64
+ licenses:
65
+ - MIT
66
+ metadata: {}
67
+ post_install_message:
68
+ rdoc_options: []
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ required_rubygems_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ requirements: []
82
+ rubyforge_project:
83
+ rubygems_version: 2.5.1
84
+ signing_key:
85
+ specification_version: 4
86
+ summary: Scrapes the tables from a MediaWiki page.
87
+ test_files: []
Binary file