mediawiki_table_scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 82745728df615aa53c1e9275ddfabdd1476217f8
4
+ data.tar.gz: 9a1115015b99c30cbea93f6947ce0c1fc622eff0
5
+ SHA512:
6
+ metadata.gz: 82e76b145d542b0b953d3600ededb71776e36941ec01b0d3442c937289d7d9c481a302e3f387d5cc49e028f40061338fc92032f6f8c033d96441b73b8a9b1122
7
+ data.tar.gz: 734127c59510a08c4171f6eff361b9b413eeb64775f4e1a3d6debef66fcb7211865a06da72cb7dadd37514ce529f3df67b10d91cf167a10375865836def10dc9
@@ -0,0 +1 @@
1
+ p�"u��0���v��~�#g�k[^ð�̿������ۊ%Ǡl�3yڳ
Binary file
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # file: mediawiki_table_scraper.rb
4
+
5
+
6
+ require 'nokorexi'
7
+
8
+
9
# Scrapes every element matching the CSS selector '.wikitable' from the
# document that Nokorexi builds for a URL, and exposes each table as an
# array of row hashes keyed by the downcased, symbolised header labels.
#
# NOTE(review): header labels are used as-is apart from #downcase, so any
# surrounding whitespace in a header cell ends up in the key - confirm
# whether callers rely on that before normalising it.
class MediaWikiTableScraper

  # Matches one opening or closing markup tag, e.g. "<b>" or "</a>".
  TAG_PATTERN = %r{</?\w+[^>]*>}.freeze

  # One entry per scraped table; each entry is an array of row hashes.
  attr_reader :tables

  # Builds the document for +url+ and converts each '.wikitable' element
  # into an array of hash records.
  #
  # url - passed unchanged to Nokorexi.new (presumably a page URL string).
  def initialize(url)

    doc = Nokorexi.new(url).to_doc

    @tables = doc.root.css('.wikitable').map {|table| records_for(table) }

  end

  # Returns the same object as #tables.
  def to_a
    @tables
  end

  private

  # Turns one table element into an array of hashes, one per data row.
  # The first 'tr' row supplies the column names; it is removed from the
  # row list so that only data rows are mapped.
  def records_for(table)

    rows = table.xpath 'tr'
    header = rows.shift

    # a table without any rows has no header and therefore no records
    return [] unless header

    names = header.xpath('th/text()').map {|label| label.downcase.to_sym }

    # zip pads with nil when a row has fewer cells than there are names
    # and drops surplus cells when it has more
    rows.map do |row|
      names.zip(row.xpath('td').map {|cell| cell_text(cell) }).to_h
    end

  end

  # Flattens one 'td' cell to plain text. A cell holding child elements
  # has every child rendered (String children verbatim, other children as
  # their XML with the tags stripped) and the pieces joined by one space.
  def cell_text(cell)

    return cell.text.to_s unless cell.has_elements?

    cell.children.map do |child|
      child.is_a?(String) ? child : child.xml.gsub(TAG_PATTERN, '')
    end.join ' '

  end

end
56
+
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mediawiki_table_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - James Robertson
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain:
11
+ - |
12
+ -----BEGIN CERTIFICATE-----
13
+ MIIDljCCAn6gAwIBAgIBATANBgkqhkiG9w0BAQUFADBIMRIwEAYDVQQDDAlnZW1t
14
+ YXN0ZXIxHjAcBgoJkiaJk/IsZAEZFg5qYW1lc3JvYmVydHNvbjESMBAGCgmSJomT
15
+ 8ixkARkWAmV1MB4XDTE2MDYxNDE1MTQwOVoXDTE3MDYxNDE1MTQwOVowSDESMBAG
16
+ A1UEAwwJZ2VtbWFzdGVyMR4wHAYKCZImiZPyLGQBGRYOamFtZXNyb2JlcnRzb24x
17
+ EjAQBgoJkiaJk/IsZAEZFgJldTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoC
18
+ ggEBALc/0KGICil5uPNrZZUisbHWa1tPdaZoRfO03/t5KI1XdnT/1K8bcuw8j5Fb
19
+ 2e+2QnDfIHVj3F9hZ4BtA1Z4lF7PxlHtQ4SjhxBLeLmcZtX8ZFIi73PLNItwtOf/
20
+ CSI8oGyNxFUKzcbNtRTat8+jRUSs15vUrfxQD/q0RAZTnfamLj1b6ijrL1nqk36T
21
+ x4wkEEdyPwou7I8qKJcaYHVrYCrbR5x9ZD77fqsLPNkIoI+SoBQa39+Ph+EjpXHV
22
+ qg4bOklAI3Wmn/nQhepuQ9dCQM5zIn85WpwAiK5QeIaqrvnGuBztW5KRBa1diQit
23
+ aGKlLl962VtYTp+uALhDBHo/z00CAwEAAaOBijCBhzAJBgNVHRMEAjAAMAsGA1Ud
24
+ DwQEAwIEsDAdBgNVHQ4EFgQUnLL92rOFDH49Ig6knBxlNicuha4wJgYDVR0RBB8w
25
+ HYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1h
26
+ c3RlckBqYW1lc3JvYmVydHNvbi5ldTANBgkqhkiG9w0BAQUFAAOCAQEADi7zgG/H
27
+ OjC7uEVwwyhHNAt2lW+FdETDGpvJmcB/Z5lB4gbDEozB+hOnTd3oKH8DZeoyEi/Y
28
+ 3QmxuNDvAKeNV5bngMxG/5k+zeY4tenyK2K1VVzlV8zgfnd3JWpqbDGRjOYqV+2K
29
+ IZxoUnra09diNAo5c74oOaxfS75Tfle2zwjDLHAJat+kxhVmsnMXBSDpE4RDz8E3
30
+ aWA3AnmwsgbUbHlniNMwSgSn8JzmCp0vRRIyN1Lw2rmoX1IXsAsAk1t/2RwBn+LC
31
+ FWN+XXB3cbVDsx+uRkGyPtPZZbuqezbtXZUMmv+4kceg02I8lDt4PEk8Hrd5ybDi
32
+ WVhFo+q8CNPc1Q==
33
+ -----END CERTIFICATE-----
34
+ date: 2016-06-14 00:00:00.000000000 Z
35
+ dependencies:
36
+ - !ruby/object:Gem::Dependency
37
+ name: nokorexi
38
+ requirement: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - "~>"
41
+ - !ruby/object:Gem::Version
42
+ version: '0.3'
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: 0.3.1
46
+ type: :runtime
47
+ prerelease: false
48
+ version_requirements: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - "~>"
51
+ - !ruby/object:Gem::Version
52
+ version: '0.3'
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: 0.3.1
56
+ description:
57
+ email: james@r0bertson.co.uk
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - lib/mediawiki_table_scraper.rb
63
+ homepage: https://github.com/jrobertson/mediawiki_table_scraper
64
+ licenses:
65
+ - MIT
66
+ metadata: {}
67
+ post_install_message:
68
+ rdoc_options: []
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ required_rubygems_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ requirements: []
82
+ rubyforge_project:
83
+ rubygems_version: 2.5.1
84
+ signing_key:
85
+ specification_version: 4
86
+ summary: Scrapes the tables from a MediaWiki page.
87
+ test_files: []
Binary file