wikiscript 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/NOTES.md +30 -1
- data/lib/wikiscript.rb +19 -6
- data/lib/wikiscript/client.rb +12 -11
- data/lib/wikiscript/page.rb +13 -4
- data/lib/wikiscript/page_reader.rb +5 -3
- data/lib/wikiscript/table_reader.rb +20 -10
- data/lib/wikiscript/version.rb +1 -1
- data/test/test_page.rb +20 -6
- data/test/test_page_de.rb +12 -1
- data/test/test_page_reader.rb +2 -2
- data/test/test_table_reader.rb +37 -9
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fe20e2b58b4703144ef8a83d1cb41edf92f5ef3e
|
4
|
+
data.tar.gz: ae8a0d31b16099c33fee4ac450b27433852b8fb8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 61fe785e9ea9150f6a5711ff0038cb135aba18f719abf26d6783bf50c465678f7376c315bcce3091b325d8ac9ca4714e084f93503e21dae583cddb9c904863d5
|
7
|
+
data.tar.gz: c437c92ba5df517d21231060044ab690aba2b1b11a368d37018c65eb493cea467bbd3331f161f217c1764fc24025fcf97a57ab3889e5aa2fc6b53d57e460f13b
|
data/NOTES.md
CHANGED
@@ -3,4 +3,33 @@
|
|
3
3
|
|
4
4
|
## Alternatives
|
5
5
|
|
6
|
-
|
6
|
+
|
7
|
+
|
8
|
+
- [wikipedia-client](https://rubygems.org/gems/wikipedia-client) - by Ken Pratt et al - ruby client for the Wikipedia API
|
9
|
+
- <https://github.com/kenpratt/wikipedia-client>
|
10
|
+
- <https://www.rubydoc.info/gems/wikipedia-client>
|
11
|
+
|
12
|
+
<!-- break -->
|
13
|
+
|
14
|
+
- [infoboxer](https://rubygems.org/gems/infoboxer) - by Victor Shepelev et al - pure-Ruby Wikipedia (and generic MediaWiki) client and parser, targeting information extraction
|
15
|
+
- <https://github.com/molybdenum-99/infoboxer>
|
16
|
+
- <https://www.rubydoc.info/gems/infoboxer>
|
17
|
+
|
18
|
+
<!-- break -->
|
19
|
+
|
20
|
+
More
|
21
|
+
|
22
|
+
- <https://github.com/molybdenum-99/reality>
|
23
|
+
- https://github.com/molybdenum-99/mediawiktory
|
24
|
+
|
25
|
+
**Python**
|
26
|
+
|
27
|
+
- <https://pypi.org/project/wptools/> - Wikipedia tools (for Humans)
|
28
|
+
- <https://github.com/siznax/wptools/>
|
29
|
+
|
30
|
+
|
31
|
+
## Wikipedia
|
32
|
+
|
33
|
+
- Wikipedia API reference: <http://en.wikipedia.org/w/api.php>
|
34
|
+
|
35
|
+
|
data/lib/wikiscript.rb
CHANGED
@@ -51,11 +51,11 @@ module Wikiscript
|
|
51
51
|
}x
|
52
52
|
|
53
53
|
|
54
|
-
def self.unlink(
|
54
|
+
def self.unlink( text )
|
55
55
|
## replace ALL wiki links with title (or link)
|
56
56
|
## e.g. [[Santiago]] ([[La Florida, Chile|La Florida]])
|
57
57
|
## => Santiago (La Florida)
|
58
|
-
|
58
|
+
text = text.gsub( LINK_PATTERN ) do |_|
|
59
59
|
link = $~[:link]
|
60
60
|
title = $~[:title]
|
61
61
|
|
@@ -66,14 +66,16 @@ module Wikiscript
|
|
66
66
|
end
|
67
67
|
end
|
68
68
|
|
69
|
-
|
69
|
+
text.strip
|
70
|
+
end
|
71
|
+
class << self
|
72
|
+
alias_method :flatten_links, :unlink
|
70
73
|
end
|
71
74
|
|
72
|
-
|
73
|
-
def self.parse_link( value ) ## todo/change: find a better name - use match_link/etc. - why? why not?
|
75
|
+
def self.parse_link( text ) ## todo/change: find a better name - use match_link/etc. - why? why not?
|
74
76
|
## find first matching link
|
75
77
|
## return [nil,nil] if nothing found
|
76
|
-
if (m = LINK_PATTERN.match(
|
78
|
+
if (m = LINK_PATTERN.match( text ))
|
77
79
|
link = m[:link]
|
78
80
|
title = m[:title]
|
79
81
|
|
@@ -85,6 +87,17 @@ module Wikiscript
|
|
85
87
|
end
|
86
88
|
end
|
87
89
|
|
90
|
+
############################
|
91
|
+
## more convenience shortcuts / helpers
|
92
|
+
def self.parse( text ) PageReader.parse( text ); end
|
93
|
+
def self.parse_table( text ) TableReader.parse_table( text ); end
|
94
|
+
|
95
|
+
def self.get( title, lang: Wikiscript.lang ) Page.get( title, lang: lang ); end
|
96
|
+
class << self
|
97
|
+
alias_method :fetch, :get
|
98
|
+
alias_method :download, :get
|
99
|
+
end
|
100
|
+
|
88
101
|
end # module Wikiscript
|
89
102
|
|
90
103
|
|
data/lib/wikiscript/client.rb
CHANGED
@@ -15,30 +15,30 @@ module Wikiscript
|
|
15
15
|
@worker = Fetcher::Worker.new
|
16
16
|
end
|
17
17
|
|
18
|
-
## change to: wikitext why? why not? or to raw? why? why not?
|
19
|
-
def text( title )
|
18
|
+
## change to: wikitext or raw why? why not? or to raw? why? why not?
|
19
|
+
def text( title, lang: Wikiscript.lang )
|
20
20
|
## todo/fix: convert spaces to _ if not present for wikipedia page title - why ?? why not ???
|
21
|
-
get( action: 'raw', title: title )
|
22
|
-
end
|
23
21
|
|
24
|
-
|
25
|
-
def site_base
|
26
|
-
## replace lang w/ lang config if present e.g.
|
22
|
+
## note: replace lang w/ lang config if present e.g.
|
27
23
|
## http://{lang}.wikipedia.org/w/index.php
|
28
24
|
# becomes
|
29
25
|
# http://en.wikipedia.org/w/index.php or
|
30
26
|
# http://de.wikipedia.org/w/index.php etc
|
27
|
+
base_url = SITE_BASE.gsub( "{lang}", lang )
|
28
|
+
params = { action: 'raw',
|
29
|
+
title: title }
|
31
30
|
|
32
|
-
|
31
|
+
get( base_url, params )
|
33
32
|
end
|
34
33
|
|
34
|
+
private
|
35
35
|
def build_query( h )
|
36
36
|
h.map do |k,v|
|
37
37
|
"#{CGI.escape(k.to_s)}=#{CGI.escape(v.to_s)}"
|
38
38
|
end.join( '&' )
|
39
39
|
end
|
40
40
|
|
41
|
-
def get( params )
|
41
|
+
def get( base_url, params )
|
42
42
|
# note: lets us passing in params as hash e.g.
|
43
43
|
# action: 'raw', title: 'Austria'
|
44
44
|
# key and values will get CGI escaped
|
@@ -46,7 +46,7 @@ private
|
|
46
46
|
|
47
47
|
## uri = URI.parse( "#{SITE_BASE}?#{params}" )
|
48
48
|
## fix: pass in uri (add to fetcher check for is_a? URI etc.)
|
49
|
-
uri_string = "#{
|
49
|
+
uri_string = "#{base_url}?#{query}"
|
50
50
|
|
51
51
|
response = @worker.get_response( uri_string )
|
52
52
|
|
@@ -69,7 +69,8 @@ private
|
|
69
69
|
t
|
70
70
|
else
|
71
71
|
logger.error "fetch HTTP - #{response.code} #{response.message}"
|
72
|
-
|
72
|
+
exit 1 ### exit for now on error - why? why not?
|
73
|
+
## nil
|
73
74
|
end
|
74
75
|
end
|
75
76
|
|
data/lib/wikiscript/page.rb
CHANGED
@@ -6,15 +6,24 @@ module Wikiscript
|
|
6
6
|
|
7
7
|
include LogUtils::Logging
|
8
8
|
|
9
|
-
attr_reader :title
|
9
|
+
attr_reader :title, :lang
|
10
10
|
|
11
|
-
|
11
|
+
|
12
|
+
def self.get( title, lang: Wikiscript.lang ) ## todo/check: add a fetch/download alias - why? why not?
|
13
|
+
o = new( title: title, lang: lang )
|
14
|
+
## o.text ## "force" download / fetch
|
15
|
+
o
|
16
|
+
end
|
17
|
+
|
18
|
+
|
19
|
+
def initialize( text=nil, title: nil, lang: Wikiscript.lang )
|
12
20
|
## todo: check title
|
13
21
|
## replace title spaces w/ _ ????
|
14
22
|
## to allow "pretty" titles - why? why not??
|
15
23
|
|
16
|
-
@title = title
|
17
24
|
@text = text
|
25
|
+
@title = title
|
26
|
+
@lang = lang
|
18
27
|
end
|
19
28
|
|
20
29
|
def text
|
@@ -22,7 +31,7 @@ module Wikiscript
|
|
22
31
|
end
|
23
32
|
|
24
33
|
def download_text
|
25
|
-
Client.new.text( @title )
|
34
|
+
Client.new.text( @title, lang: @lang )
|
26
35
|
end
|
27
36
|
|
28
37
|
def parse ## todo/change: use/find a different name e.g. doc/elements/etc. - why? why not?
|
@@ -53,9 +53,11 @@ class PageReader
|
|
53
53
|
elsif inside_table
|
54
54
|
table_txt << line << "\n"
|
55
55
|
else
|
56
|
-
|
57
|
-
|
58
|
-
|
56
|
+
## note: skip unknown line types for now
|
57
|
+
|
58
|
+
## puts "** !!! ERROR !!! unknown line type in wiki page:"
|
59
|
+
## pp line
|
60
|
+
## exit 1
|
59
61
|
end
|
60
62
|
end
|
61
63
|
page
|
@@ -65,21 +65,31 @@ class TableReader
|
|
65
65
|
row = []
|
66
66
|
rows << row
|
67
67
|
end
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
68
|
+
if values.empty?
|
69
|
+
## note: support continuing column text in next line
|
70
|
+
row << String.new
|
71
|
+
else
|
72
|
+
## add each value one-by-one for now (to keep (same) row reference)
|
73
|
+
## note: also strip leading (optional) attributes
|
74
|
+
values.each do |value|
|
75
|
+
row << strip_emphases( strip_attributes( value.strip ))
|
76
|
+
end
|
72
77
|
end
|
73
78
|
elsif inside_table && line.start_with?( '|' ) ## table data
|
74
79
|
values = line.sub( '|', '' ).strip.split( '||' )
|
75
|
-
|
76
|
-
|
77
|
-
row <<
|
80
|
+
if values.empty?
|
81
|
+
## note: support continuing column text in next line
|
82
|
+
row << String.new
|
83
|
+
else
|
84
|
+
## add each value one-by-one for now (to keep (same) row reference)
|
85
|
+
values.each do |value|
|
86
|
+
row << strip_emphases( strip_attributes( value.strip ))
|
87
|
+
end
|
78
88
|
end
|
79
89
|
elsif inside_table
|
80
|
-
|
81
|
-
|
82
|
-
|
90
|
+
## note: support continuing column text in next line
|
91
|
+
## todo/check: for now doesn't support multi-line just simple continuing line - fix later if needed!!!
|
92
|
+
row[-1] << line
|
83
93
|
else
|
84
94
|
puts "!! ERROR !! unknown line type outside (before or after) table:"
|
85
95
|
puts line
|
data/lib/wikiscript/version.rb
CHANGED
data/test/test_page.rb
CHANGED
@@ -1,5 +1,9 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
+
###
|
4
|
+
# to run use
|
5
|
+
# ruby -I ./lib -I ./test test/test_page.rb
|
6
|
+
|
3
7
|
|
4
8
|
require 'helper'
|
5
9
|
|
@@ -11,7 +15,12 @@ class TestPage < MiniTest::Test
|
|
11
15
|
end
|
12
16
|
|
13
17
|
def test_austria_en
|
14
|
-
page = Wikiscript::Page.
|
18
|
+
page = Wikiscript::Page.get( 'Austria' )
|
19
|
+
# [debug] GET /w/index.php?action=raw&title=Austria uri=http://en.wikipedia.org/w/index.php?action=raw&title=Austria, redirect_limit=5
|
20
|
+
# [debug] 301 TLS Redirect location=https://en.wikipedia.org/w/index.php?action=raw&title=Austria
|
21
|
+
# [debug] GET /w/index.php?action=raw&title=Austria uri=https://en.wikipedia.org/w/index.php?action=raw&title=Austria, redirect_limit=4
|
22
|
+
# [debug] 200 OK
|
23
|
+
|
15
24
|
text = page.text
|
16
25
|
|
17
26
|
## print first 600 chars
|
@@ -21,20 +30,25 @@ class TestPage < MiniTest::Test
|
|
21
30
|
assert /{{Infobox country/ =~ text
|
22
31
|
assert /common_name = Austria/ =~ text
|
23
32
|
assert /capital = \[\[Vienna\]\]/ =~ text
|
24
|
-
assert /The origins of modern-day Austria date back to the time/ =~ text
|
33
|
+
# assert /The origins of modern-day Austria date back to the time/ =~ text
|
25
34
|
end
|
26
35
|
|
27
36
|
def test_sankt_poelten_en
|
28
|
-
page = Wikiscript::Page.
|
37
|
+
page = Wikiscript::Page.get( 'Sankt_Pölten' )
|
38
|
+
# [debug] GET /w/index.php?action=raw&title=Sankt_P%C3%B6lten uri=http://en.wikipedia.org/w/index.php?action=raw&title=Sankt_P%C3%B6lten, redirect_limit=5
|
39
|
+
# [debug] 301 TLS Redirect location=https://en.wikipedia.org/w/index.php?action=raw&title=Sankt_P%C3%B6lten
|
40
|
+
# [debug] GET /w/index.php?action=raw&title=Sankt_P%C3%B6lten uri=https://en.wikipedia.org/w/index.php?action=raw&title=Sankt_P%C3%B6lten, redirect_limit=4
|
41
|
+
# [debug] 200 OK
|
42
|
+
|
29
43
|
text = page.text
|
30
44
|
|
31
45
|
## print first 600 chars
|
32
46
|
pp text[0..600]
|
33
47
|
|
34
48
|
## check for some snippets
|
35
|
-
assert /{{Infobox
|
36
|
-
assert /
|
37
|
-
assert /'''Sankt Pölten''' \(''St. Pölten''\) is the capital city of/ =~ text
|
49
|
+
assert /{{Infobox settlement/ =~ text
|
50
|
+
assert /name\s+=\s+Sankt Pölten/ =~ text
|
51
|
+
# assert /'''Sankt Pölten''' \(''St. Pölten''\) is the capital city of/ =~ text
|
38
52
|
end
|
39
53
|
|
40
54
|
end # class TestPage
|
data/test/test_page_de.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
|
4
|
+
###
|
5
|
+
# to run use
|
6
|
+
# ruby -I ./lib -I ./test test/test_page_de.rb
|
7
|
+
|
8
|
+
|
4
9
|
require 'helper'
|
5
10
|
|
6
11
|
|
@@ -11,7 +16,13 @@ class TestPageDe < MiniTest::Test
|
|
11
16
|
end
|
12
17
|
|
13
18
|
def test_st_poelten_de
|
14
|
-
page = Wikiscript::Page.
|
19
|
+
page = Wikiscript::Page.get( 'St._Pölten' )
|
20
|
+
# [debug] GET /w/index.php?action=raw&title=St._P%C3%B6lten uri=http://de.wikipedia.org/w/index.php?action=raw&title=St._P%C3%B6lten, redirect_limit=5
|
21
|
+
# [debug] 301 TLS Redirect location=https://de.wikipedia.org/w/index.php?action=raw&title=St._P%C3%B6lten
|
22
|
+
# [debug] GET /w/index.php?action=raw&title=St._P%C3%B6lten uri=https://de.wikipedia.org/w/index.php?action=raw&title=St._P%C3%B6lten, redirect_limit=4
|
23
|
+
# [debug] 200 OK
|
24
|
+
|
25
|
+
|
15
26
|
text = page.text
|
16
27
|
|
17
28
|
## print first 600 chars
|
data/test/test_page_reader.rb
CHANGED
@@ -11,7 +11,7 @@ require 'helper'
|
|
11
11
|
class TestPageReader < MiniTest::Test
|
12
12
|
|
13
13
|
def test_basic
|
14
|
-
el = Wikiscript
|
14
|
+
el = Wikiscript.parse( <<TXT )
|
15
15
|
=Heading 1==
|
16
16
|
==Heading 2==
|
17
17
|
===Heading 3===
|
@@ -44,7 +44,7 @@ TXT
|
|
44
44
|
end
|
45
45
|
|
46
46
|
def test_parse
|
47
|
-
page = Wikiscript::Page.new(
|
47
|
+
page = Wikiscript::Page.new( <<TXT )
|
48
48
|
=Heading 1==
|
49
49
|
==Heading 2==
|
50
50
|
===Heading 3===
|
data/test/test_table_reader.rb
CHANGED
@@ -10,7 +10,7 @@ require 'helper'
|
|
10
10
|
class TestTableReader < MiniTest::Test
|
11
11
|
|
12
12
|
def test_basic
|
13
|
-
|
13
|
+
table = Wikiscript.parse_table( <<TXT )
|
14
14
|
{|
|
15
15
|
|-
|
16
16
|
! header1
|
@@ -27,8 +27,6 @@ class TestTableReader < MiniTest::Test
|
|
27
27
|
|}
|
28
28
|
TXT
|
29
29
|
|
30
|
-
table = tables[0]
|
31
|
-
assert_equal 1, tables.size ## one table
|
32
30
|
assert_equal 3, table.size ## three rows
|
33
31
|
assert_equal ['header1', 'header2', 'header3'], table[0]
|
34
32
|
assert_equal ['row1cell1', 'row1cell2', 'row1cell3'], table[1]
|
@@ -36,7 +34,7 @@ TXT
|
|
36
34
|
end
|
37
35
|
|
38
36
|
def test_basic_ii ## with optional (missing) row divider before headers
|
39
|
-
|
37
|
+
table = Wikiscript.parse_table( <<TXT )
|
40
38
|
{|
|
41
39
|
! header1 !! header2 !! header3
|
42
40
|
|-
|
@@ -46,16 +44,48 @@ TXT
|
|
46
44
|
|}
|
47
45
|
TXT
|
48
46
|
|
49
|
-
table = tables[0]
|
50
|
-
assert_equal 1, tables.size ## one table
|
51
47
|
assert_equal 3, table.size ## three rows
|
52
48
|
assert_equal ['header1', 'header2', 'header3'], table[0]
|
53
49
|
assert_equal ['row1cell1', 'row1cell2', 'row1cell3'], table[1]
|
54
50
|
assert_equal ['row2cell1', 'row2cell2', 'row2cell3'], table[2]
|
55
51
|
end
|
56
52
|
|
53
|
+
def test_basic_iii # with continuing header column lines
|
54
|
+
table = Wikiscript.parse_table( <<TXT )
|
55
|
+
{|
|
56
|
+
|-
|
57
|
+
!
|
58
|
+
header1
|
59
|
+
!
|
60
|
+
header2
|
61
|
+
!
|
62
|
+
header3
|
63
|
+
|-
|
64
|
+
|
|
65
|
+
row1cell1
|
66
|
+
|
|
67
|
+
row1cell2
|
68
|
+
|
|
69
|
+
row1cell3
|
70
|
+
|-
|
71
|
+
|
|
72
|
+
row2cell1
|
73
|
+
|
|
74
|
+
row2cell2
|
75
|
+
|
|
76
|
+
row2cell3
|
77
|
+
|}
|
78
|
+
TXT
|
79
|
+
|
80
|
+
assert_equal 3, table.size ## three rows
|
81
|
+
assert_equal ['header1', 'header2', 'header3'], table[0]
|
82
|
+
assert_equal ['row1cell1', 'row1cell2', 'row1cell3'], table[1]
|
83
|
+
assert_equal ['row2cell1', 'row2cell2', 'row2cell3'], table[2]
|
84
|
+
end
|
85
|
+
|
86
|
+
|
57
87
|
def test_strip_attributes_and_emphases
|
58
|
-
|
88
|
+
table = Wikiscript.parse_table( <<TXT )
|
59
89
|
{|
|
60
90
|
|-
|
61
91
|
! style="width:200px;"|Club
|
@@ -69,8 +99,6 @@ TXT
|
|
69
99
|
|}
|
70
100
|
TXT
|
71
101
|
|
72
|
-
table = tables[0]
|
73
|
-
assert_equal 1, tables.size ## one table
|
74
102
|
assert_equal 4, table.size ## four rows
|
75
103
|
assert_equal ['Club', 'City'], table[0]
|
76
104
|
assert_equal ['[[Biu Chun Rangers]]', '[[Sham Shui Po]]'], table[1]
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wikiscript
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-09-
|
11
|
+
date: 2019-09-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: logutils
|