mediawiki_table_scraper 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +1 -0
- data.tar.gz.sig +0 -0
- data/lib/mediawiki_table_scraper.rb +56 -0
- metadata +87 -0
- metadata.gz.sig +0 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 82745728df615aa53c1e9275ddfabdd1476217f8
|
4
|
+
data.tar.gz: 9a1115015b99c30cbea93f6947ce0c1fc622eff0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 82e76b145d542b0b953d3600ededb71776e36941ec01b0d3442c937289d7d9c481a302e3f387d5cc49e028f40061338fc92032f6f8c033d96441b73b8a9b1122
|
7
|
+
data.tar.gz: 734127c59510a08c4171f6eff361b9b413eeb64775f4e1a3d6debef66fcb7211865a06da72cb7dadd37514ce529f3df67b10d91cf167a10375865836def10dc9
|
checksums.yaml.gz.sig
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
p�"u��0���v��~�#g�k[^ð�̿������ۊ%Ǡl�3yڳ
|
data.tar.gz.sig
ADDED
Binary file
|
@@ -0,0 +1,56 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# file: mediawiki_table_scraper.rb
|
4
|
+
|
5
|
+
|
6
|
+
require 'nokorexi'
|
7
|
+
|
8
|
+
|
9
|
+
# Extracts the contents of every element carrying the CSS class `wikitable`
# from a document and exposes each table as an array of hash records.
#
# Each record maps a column name (taken from the `th` text of the table's
# first row, downcased and converted to a Symbol) to the cell text found in
# the same position of a data row.
class MediaWikiTableScraper
  # Matches an opening or closing markup tag so it can be removed from the
  # serialised form of a cell's child element.
  TAG_PATTERN = %r{</?\w+[^>]*>}.freeze

  # @return [Array<Array<Hash>>] one array of records per table found
  attr_reader :tables

  # Loads the document through Nokorexi and converts every `.wikitable`
  # element into an array of records.
  #
  # @param url [String] location handed unchanged to `Nokorexi.new`
  #   (presumably the address of a MediaWiki page -- confirm against Nokorexi)
  def initialize(url)
    doc = Nokorexi.new(url).to_doc

    @tables = doc.root.css('.wikitable').map { |table| table_records(table) }
  end

  # @return [Array<Array<Hash>>] the same object returned by #tables
  def to_a
    @tables
  end

  private

  # Builds the records for a single table element.
  #
  # @param table [#xpath] a table element
  # @return [Array<Hash>] one hash per data row; empty when the table has no
  #   rows at all
  def table_records(table)
    rows = table.xpath('tr')

    # The first row supplies the column names. A table without any rows has
    # no header to read, so it yields no records instead of raising.
    header = rows.shift
    return [] unless header

    names = header.xpath('th/text()').map { |label| label.downcase.to_sym }

    rows.map do |row|
      values = row.xpath('td').map { |cell| cell_text(cell) }
      names.zip(values).to_h
    end
  end

  # Returns the textual content of one table cell.
  #
  # When the cell has child elements, String children are kept as they are,
  # every other child is serialised with `xml` and has its tags removed, and
  # the pieces are joined with a single space. Otherwise the cell's own text
  # is returned (an empty string when that text is nil).
  #
  # @param cell [#has_elements?, #children, #text] a `td` element
  # @return [String]
  def cell_text(cell)
    return cell.text.to_s unless cell.has_elements?

    pieces = cell.children.map do |child|
      child.is_a?(String) ? child : child.xml.gsub(TAG_PATTERN, '')
    end

    pieces.join(' ')
  end
end
|
56
|
+
|
metadata
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mediawiki_table_scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- James Robertson
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain:
|
11
|
+
- |
|
12
|
+
-----BEGIN CERTIFICATE-----
|
13
|
+
MIIDljCCAn6gAwIBAgIBATANBgkqhkiG9w0BAQUFADBIMRIwEAYDVQQDDAlnZW1t
|
14
|
+
YXN0ZXIxHjAcBgoJkiaJk/IsZAEZFg5qYW1lc3JvYmVydHNvbjESMBAGCgmSJomT
|
15
|
+
8ixkARkWAmV1MB4XDTE2MDYxNDE1MTQwOVoXDTE3MDYxNDE1MTQwOVowSDESMBAG
|
16
|
+
A1UEAwwJZ2VtbWFzdGVyMR4wHAYKCZImiZPyLGQBGRYOamFtZXNyb2JlcnRzb24x
|
17
|
+
EjAQBgoJkiaJk/IsZAEZFgJldTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoC
|
18
|
+
ggEBALc/0KGICil5uPNrZZUisbHWa1tPdaZoRfO03/t5KI1XdnT/1K8bcuw8j5Fb
|
19
|
+
2e+2QnDfIHVj3F9hZ4BtA1Z4lF7PxlHtQ4SjhxBLeLmcZtX8ZFIi73PLNItwtOf/
|
20
|
+
CSI8oGyNxFUKzcbNtRTat8+jRUSs15vUrfxQD/q0RAZTnfamLj1b6ijrL1nqk36T
|
21
|
+
x4wkEEdyPwou7I8qKJcaYHVrYCrbR5x9ZD77fqsLPNkIoI+SoBQa39+Ph+EjpXHV
|
22
|
+
qg4bOklAI3Wmn/nQhepuQ9dCQM5zIn85WpwAiK5QeIaqrvnGuBztW5KRBa1diQit
|
23
|
+
aGKlLl962VtYTp+uALhDBHo/z00CAwEAAaOBijCBhzAJBgNVHRMEAjAAMAsGA1Ud
|
24
|
+
DwQEAwIEsDAdBgNVHQ4EFgQUnLL92rOFDH49Ig6knBxlNicuha4wJgYDVR0RBB8w
|
25
|
+
HYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1h
|
26
|
+
c3RlckBqYW1lc3JvYmVydHNvbi5ldTANBgkqhkiG9w0BAQUFAAOCAQEADi7zgG/H
|
27
|
+
OjC7uEVwwyhHNAt2lW+FdETDGpvJmcB/Z5lB4gbDEozB+hOnTd3oKH8DZeoyEi/Y
|
28
|
+
3QmxuNDvAKeNV5bngMxG/5k+zeY4tenyK2K1VVzlV8zgfnd3JWpqbDGRjOYqV+2K
|
29
|
+
IZxoUnra09diNAo5c74oOaxfS75Tfle2zwjDLHAJat+kxhVmsnMXBSDpE4RDz8E3
|
30
|
+
aWA3AnmwsgbUbHlniNMwSgSn8JzmCp0vRRIyN1Lw2rmoX1IXsAsAk1t/2RwBn+LC
|
31
|
+
FWN+XXB3cbVDsx+uRkGyPtPZZbuqezbtXZUMmv+4kceg02I8lDt4PEk8Hrd5ybDi
|
32
|
+
WVhFo+q8CNPc1Q==
|
33
|
+
-----END CERTIFICATE-----
|
34
|
+
date: 2016-06-14 00:00:00.000000000 Z
|
35
|
+
dependencies:
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: nokorexi
|
38
|
+
requirement: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - "~>"
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: '0.3'
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 0.3.1
|
46
|
+
type: :runtime
|
47
|
+
prerelease: false
|
48
|
+
version_requirements: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - "~>"
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0.3'
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: 0.3.1
|
56
|
+
description:
|
57
|
+
email: james@r0bertson.co.uk
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- lib/mediawiki_table_scraper.rb
|
63
|
+
homepage: https://github.com/jrobertson/mediawiki_table_scraper
|
64
|
+
licenses:
|
65
|
+
- MIT
|
66
|
+
metadata: {}
|
67
|
+
post_install_message:
|
68
|
+
rdoc_options: []
|
69
|
+
require_paths:
|
70
|
+
- lib
|
71
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
77
|
+
requirements:
|
78
|
+
- - ">="
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: '0'
|
81
|
+
requirements: []
|
82
|
+
rubyforge_project:
|
83
|
+
rubygems_version: 2.5.1
|
84
|
+
signing_key:
|
85
|
+
specification_version: 4
|
86
|
+
summary: Scrapes the tables from a MediaWiki page.
|
87
|
+
test_files: []
|
metadata.gz.sig
ADDED
Binary file
|