wikiscript 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/NOTES.md +30 -1
- data/lib/wikiscript.rb +19 -6
- data/lib/wikiscript/client.rb +12 -11
- data/lib/wikiscript/page.rb +13 -4
- data/lib/wikiscript/page_reader.rb +5 -3
- data/lib/wikiscript/table_reader.rb +20 -10
- data/lib/wikiscript/version.rb +1 -1
- data/test/test_page.rb +20 -6
- data/test/test_page_de.rb +12 -1
- data/test/test_page_reader.rb +2 -2
- data/test/test_table_reader.rb +37 -9
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fe20e2b58b4703144ef8a83d1cb41edf92f5ef3e
|
4
|
+
data.tar.gz: ae8a0d31b16099c33fee4ac450b27433852b8fb8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 61fe785e9ea9150f6a5711ff0038cb135aba18f719abf26d6783bf50c465678f7376c315bcce3091b325d8ac9ca4714e084f93503e21dae583cddb9c904863d5
|
7
|
+
data.tar.gz: c437c92ba5df517d21231060044ab690aba2b1b11a368d37018c65eb493cea467bbd3331f161f217c1764fc24025fcf97a57ab3889e5aa2fc6b53d57e460f13b
|
data/NOTES.md
CHANGED
@@ -3,4 +3,33 @@
|
|
3
3
|
|
4
4
|
## Alternatives
|
5
5
|
|
6
|
-
|
6
|
+
|
7
|
+
|
8
|
+
- [wikipedia-client](https://rubygems.org/gems/wikipedia-client) - by Ken Pratt et al - ruby client for the Wikipedia API
|
9
|
+
- <https://github.com/kenpratt/wikipedia-client>
|
10
|
+
- <https://www.rubydoc.info/gems/wikipedia-client>
|
11
|
+
|
12
|
+
<!-- break -->
|
13
|
+
|
14
|
+
- [infoboxer](https://rubygems.org/gems/infoboxer) - by Victor Shepelev et al - pure-Ruby Wikipedia (and generic MediaWiki) client and parser, targeting information extraction
|
15
|
+
- <https://github.com/molybdenum-99/infoboxer>
|
16
|
+
- <https://www.rubydoc.info/gems/infoboxer>
|
17
|
+
|
18
|
+
<!-- break -->
|
19
|
+
|
20
|
+
More
|
21
|
+
|
22
|
+
- <https://github.com/molybdenum-99/reality>
|
23
|
+
- <https://github.com/molybdenum-99/mediawiktory>
|
24
|
+
|
25
|
+
**Python**
|
26
|
+
|
27
|
+
- <https://pypi.org/project/wptools/> - Wikipedia tools (for Humans)
|
28
|
+
- <https://github.com/siznax/wptools/>
|
29
|
+
|
30
|
+
|
31
|
+
## Wikipedia
|
32
|
+
|
33
|
+
- Wikipedia API reference: <http://en.wikipedia.org/w/api.php>
|
34
|
+
|
35
|
+
|
data/lib/wikiscript.rb
CHANGED
@@ -51,11 +51,11 @@ module Wikiscript
|
|
51
51
|
}x
|
52
52
|
|
53
53
|
|
54
|
-
def self.unlink(
|
54
|
+
def self.unlink( text )
|
55
55
|
## replace ALL wiki links with title (or link)
|
56
56
|
## e.g. [[Santiago]] ([[La Florida, Chile|La Florida]])
|
57
57
|
## => Santiago (La Florida)
|
58
|
-
|
58
|
+
text = text.gsub( LINK_PATTERN ) do |_|
|
59
59
|
link = $~[:link]
|
60
60
|
title = $~[:title]
|
61
61
|
|
@@ -66,14 +66,16 @@ module Wikiscript
|
|
66
66
|
end
|
67
67
|
end
|
68
68
|
|
69
|
-
|
69
|
+
text.strip
|
70
|
+
end
|
71
|
+
class << self
|
72
|
+
alias_method :flatten_links, :unlink
|
70
73
|
end
|
71
74
|
|
72
|
-
|
73
|
-
def self.parse_link( value ) ## todo/change: find a better name - use match_link/etc. - why? why not?
|
75
|
+
def self.parse_link( text ) ## todo/change: find a better name - use match_link/etc. - why? why not?
|
74
76
|
## find first matching link
|
75
77
|
## return [nil,nil] if nothing found
|
76
|
-
if (m = LINK_PATTERN.match(
|
78
|
+
if (m = LINK_PATTERN.match( text ))
|
77
79
|
link = m[:link]
|
78
80
|
title = m[:title]
|
79
81
|
|
@@ -85,6 +87,17 @@ module Wikiscript
|
|
85
87
|
end
|
86
88
|
end
|
87
89
|
|
90
|
+
############################
|
91
|
+
## more convenience shortcuts / helpers
|
92
|
+
def self.parse( text ) PageReader.parse( text ); end
|
93
|
+
def self.parse_table( text ) TableReader.parse_table( text ); end
|
94
|
+
|
95
|
+
def self.get( title, lang: Wikiscript.lang ) Page.get( title, lang: lang ); end
|
96
|
+
class << self
|
97
|
+
alias_method :fetch, :get
|
98
|
+
alias_method :download, :get
|
99
|
+
end
|
100
|
+
|
88
101
|
end # module Wikiscript
|
89
102
|
|
90
103
|
|
data/lib/wikiscript/client.rb
CHANGED
@@ -15,30 +15,30 @@ module Wikiscript
|
|
15
15
|
@worker = Fetcher::Worker.new
|
16
16
|
end
|
17
17
|
|
18
|
-
## change to: wikitext why? why not? or to raw? why? why not?
|
19
|
-
def text( title )
|
18
|
+
## change to: wikitext or raw? why? why not?
|
19
|
+
def text( title, lang: Wikiscript.lang )
|
20
20
|
## todo/fix: convert spaces to _ if not present for wikipedia page title - why ?? why not ???
|
21
|
-
get( action: 'raw', title: title )
|
22
|
-
end
|
23
21
|
|
24
|
-
|
25
|
-
def site_base
|
26
|
-
## replace lang w/ lang config if present e.g.
|
22
|
+
## note: replace lang w/ lang config if present e.g.
|
27
23
|
## http://{lang}.wikipedia.org/w/index.php
|
28
24
|
# becomes
|
29
25
|
# http://en.wikipedia.org/w/index.php or
|
30
26
|
# http://de.wikipedia.org/w/index.php etc
|
27
|
+
base_url = SITE_BASE.gsub( "{lang}", lang )
|
28
|
+
params = { action: 'raw',
|
29
|
+
title: title }
|
31
30
|
|
32
|
-
|
31
|
+
get( base_url, params )
|
33
32
|
end
|
34
33
|
|
34
|
+
private
|
35
35
|
def build_query( h )
|
36
36
|
h.map do |k,v|
|
37
37
|
"#{CGI.escape(k.to_s)}=#{CGI.escape(v.to_s)}"
|
38
38
|
end.join( '&' )
|
39
39
|
end
|
40
40
|
|
41
|
-
def get( params )
|
41
|
+
def get( base_url, params )
|
42
42
|
# note: lets us pass in params as a hash e.g.
|
43
43
|
# action: 'raw', title: 'Austria'
|
44
44
|
# key and values will get CGI escaped
|
@@ -46,7 +46,7 @@ private
|
|
46
46
|
|
47
47
|
## uri = URI.parse( "#{SITE_BASE}?#{params}" )
|
48
48
|
## fix: pass in uri (add to fetcher check for is_a? URI etc.)
|
49
|
-
uri_string = "#{
|
49
|
+
uri_string = "#{base_url}?#{query}"
|
50
50
|
|
51
51
|
response = @worker.get_response( uri_string )
|
52
52
|
|
@@ -69,7 +69,8 @@ private
|
|
69
69
|
t
|
70
70
|
else
|
71
71
|
logger.error "fetch HTTP - #{response.code} #{response.message}"
|
72
|
-
|
72
|
+
exit 1 ### exit for now on error - why? why not?
|
73
|
+
## nil
|
73
74
|
end
|
74
75
|
end
|
75
76
|
|
data/lib/wikiscript/page.rb
CHANGED
@@ -6,15 +6,24 @@ module Wikiscript
|
|
6
6
|
|
7
7
|
include LogUtils::Logging
|
8
8
|
|
9
|
-
attr_reader :title
|
9
|
+
attr_reader :title, :lang
|
10
10
|
|
11
|
-
|
11
|
+
|
12
|
+
def self.get( title, lang: Wikiscript.lang ) ## todo/check: add a fetch/download alias - why? why not?
|
13
|
+
o = new( title: title, lang: lang )
|
14
|
+
## o.text ## "force" download / fetch
|
15
|
+
o
|
16
|
+
end
|
17
|
+
|
18
|
+
|
19
|
+
def initialize( text=nil, title: nil, lang: Wikiscript.lang )
|
12
20
|
## todo: check title
|
13
21
|
## replace title spaces w/ _ ????
|
14
22
|
## to allow "pretty" titles - why? why not??
|
15
23
|
|
16
|
-
@title = title
|
17
24
|
@text = text
|
25
|
+
@title = title
|
26
|
+
@lang = lang
|
18
27
|
end
|
19
28
|
|
20
29
|
def text
|
@@ -22,7 +31,7 @@ module Wikiscript
|
|
22
31
|
end
|
23
32
|
|
24
33
|
def download_text
|
25
|
-
Client.new.text( @title )
|
34
|
+
Client.new.text( @title, lang: @lang )
|
26
35
|
end
|
27
36
|
|
28
37
|
def parse ## todo/change: use/find a different name e.g. doc/elements/etc. - why? why not?
|
@@ -53,9 +53,11 @@ class PageReader
|
|
53
53
|
elsif inside_table
|
54
54
|
table_txt << line << "\n"
|
55
55
|
else
|
56
|
-
|
57
|
-
|
58
|
-
|
56
|
+
## note: skip unknown line types for now
|
57
|
+
|
58
|
+
## puts "** !!! ERROR !!! unknown line type in wiki page:"
|
59
|
+
## pp line
|
60
|
+
## exit 1
|
59
61
|
end
|
60
62
|
end
|
61
63
|
page
|
@@ -65,21 +65,31 @@ class TableReader
|
|
65
65
|
row = []
|
66
66
|
rows << row
|
67
67
|
end
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
68
|
+
if values.empty?
|
69
|
+
## note: support continuing column text in next line
|
70
|
+
row << String.new
|
71
|
+
else
|
72
|
+
## add each value one-by-one for now (to keep (same) row reference)
|
73
|
+
## note: also strip leading (optional) attributes
|
74
|
+
values.each do |value|
|
75
|
+
row << strip_emphases( strip_attributes( value.strip ))
|
76
|
+
end
|
72
77
|
end
|
73
78
|
elsif inside_table && line.start_with?( '|' ) ## table data
|
74
79
|
values = line.sub( '|', '' ).strip.split( '||' )
|
75
|
-
|
76
|
-
|
77
|
-
row <<
|
80
|
+
if values.empty?
|
81
|
+
## note: support continuing column text in next line
|
82
|
+
row << String.new
|
83
|
+
else
|
84
|
+
## add each value one-by-one for now (to keep (same) row reference)
|
85
|
+
values.each do |value|
|
86
|
+
row << strip_emphases( strip_attributes( value.strip ))
|
87
|
+
end
|
78
88
|
end
|
79
89
|
elsif inside_table
|
80
|
-
|
81
|
-
|
82
|
-
|
90
|
+
## note: support continuing column text in next line
|
91
|
+
## todo/check: for now doesn't support multi-line just simple continuing line - fix later if needed!!!
|
92
|
+
row[-1] << line
|
83
93
|
else
|
84
94
|
puts "!! ERROR !! unknown line type outside (before or after) table:"
|
85
95
|
puts line
|
data/lib/wikiscript/version.rb
CHANGED
data/test/test_page.rb
CHANGED
@@ -1,5 +1,9 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
+
###
|
4
|
+
# to run use
|
5
|
+
# ruby -I ./lib -I ./test test/test_page.rb
|
6
|
+
|
3
7
|
|
4
8
|
require 'helper'
|
5
9
|
|
@@ -11,7 +15,12 @@ class TestPage < MiniTest::Test
|
|
11
15
|
end
|
12
16
|
|
13
17
|
def test_austria_en
|
14
|
-
page = Wikiscript::Page.
|
18
|
+
page = Wikiscript::Page.get( 'Austria' )
|
19
|
+
# [debug] GET /w/index.php?action=raw&title=Austria uri=http://en.wikipedia.org/w/index.php?action=raw&title=Austria, redirect_limit=5
|
20
|
+
# [debug] 301 TLS Redirect location=https://en.wikipedia.org/w/index.php?action=raw&title=Austria
|
21
|
+
# [debug] GET /w/index.php?action=raw&title=Austria uri=https://en.wikipedia.org/w/index.php?action=raw&title=Austria, redirect_limit=4
|
22
|
+
# [debug] 200 OK
|
23
|
+
|
15
24
|
text = page.text
|
16
25
|
|
17
26
|
## print first 600 chars
|
@@ -21,20 +30,25 @@ class TestPage < MiniTest::Test
|
|
21
30
|
assert /{{Infobox country/ =~ text
|
22
31
|
assert /common_name = Austria/ =~ text
|
23
32
|
assert /capital = \[\[Vienna\]\]/ =~ text
|
24
|
-
assert /The origins of modern-day Austria date back to the time/ =~ text
|
33
|
+
# assert /The origins of modern-day Austria date back to the time/ =~ text
|
25
34
|
end
|
26
35
|
|
27
36
|
def test_sankt_poelten_en
|
28
|
-
page = Wikiscript::Page.
|
37
|
+
page = Wikiscript::Page.get( 'Sankt_Pölten' )
|
38
|
+
# [debug] GET /w/index.php?action=raw&title=Sankt_P%C3%B6lten uri=http://en.wikipedia.org/w/index.php?action=raw&title=Sankt_P%C3%B6lten, redirect_limit=5
|
39
|
+
# [debug] 301 TLS Redirect location=https://en.wikipedia.org/w/index.php?action=raw&title=Sankt_P%C3%B6lten
|
40
|
+
# [debug] GET /w/index.php?action=raw&title=Sankt_P%C3%B6lten uri=https://en.wikipedia.org/w/index.php?action=raw&title=Sankt_P%C3%B6lten, redirect_limit=4
|
41
|
+
# [debug] 200 OK
|
42
|
+
|
29
43
|
text = page.text
|
30
44
|
|
31
45
|
## print first 600 chars
|
32
46
|
pp text[0..600]
|
33
47
|
|
34
48
|
## check for some snippets
|
35
|
-
assert /{{Infobox
|
36
|
-
assert /
|
37
|
-
assert /'''Sankt Pölten''' \(''St. Pölten''\) is the capital city of/ =~ text
|
49
|
+
assert /{{Infobox settlement/ =~ text
|
50
|
+
assert /name\s+=\s+Sankt Pölten/ =~ text
|
51
|
+
# assert /'''Sankt Pölten''' \(''St. Pölten''\) is the capital city of/ =~ text
|
38
52
|
end
|
39
53
|
|
40
54
|
end # class TestPage
|
data/test/test_page_de.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
|
4
|
+
###
|
5
|
+
# to run use
|
6
|
+
# ruby -I ./lib -I ./test test/test_page_de.rb
|
7
|
+
|
8
|
+
|
4
9
|
require 'helper'
|
5
10
|
|
6
11
|
|
@@ -11,7 +16,13 @@ class TestPageDe < MiniTest::Test
|
|
11
16
|
end
|
12
17
|
|
13
18
|
def test_st_poelten_de
|
14
|
-
page = Wikiscript::Page.
|
19
|
+
page = Wikiscript::Page.get( 'St._Pölten' )
|
20
|
+
# [debug] GET /w/index.php?action=raw&title=St._P%C3%B6lten uri=http://de.wikipedia.org/w/index.php?action=raw&title=St._P%C3%B6lten, redirect_limit=5
|
21
|
+
# [debug] 301 TLS Redirect location=https://de.wikipedia.org/w/index.php?action=raw&title=St._P%C3%B6lten
|
22
|
+
# [debug] GET /w/index.php?action=raw&title=St._P%C3%B6lten uri=https://de.wikipedia.org/w/index.php?action=raw&title=St._P%C3%B6lten, redirect_limit=4
|
23
|
+
# [debug] 200 OK
|
24
|
+
|
25
|
+
|
15
26
|
text = page.text
|
16
27
|
|
17
28
|
## print first 600 chars
|
data/test/test_page_reader.rb
CHANGED
@@ -11,7 +11,7 @@ require 'helper'
|
|
11
11
|
class TestPageReader < MiniTest::Test
|
12
12
|
|
13
13
|
def test_basic
|
14
|
-
el = Wikiscript
|
14
|
+
el = Wikiscript.parse( <<TXT )
|
15
15
|
=Heading 1==
|
16
16
|
==Heading 2==
|
17
17
|
===Heading 3===
|
@@ -44,7 +44,7 @@ TXT
|
|
44
44
|
end
|
45
45
|
|
46
46
|
def test_parse
|
47
|
-
page = Wikiscript::Page.new(
|
47
|
+
page = Wikiscript::Page.new( <<TXT )
|
48
48
|
=Heading 1==
|
49
49
|
==Heading 2==
|
50
50
|
===Heading 3===
|
data/test/test_table_reader.rb
CHANGED
@@ -10,7 +10,7 @@ require 'helper'
|
|
10
10
|
class TestTableReader < MiniTest::Test
|
11
11
|
|
12
12
|
def test_basic
|
13
|
-
|
13
|
+
table = Wikiscript.parse_table( <<TXT )
|
14
14
|
{|
|
15
15
|
|-
|
16
16
|
! header1
|
@@ -27,8 +27,6 @@ class TestTableReader < MiniTest::Test
|
|
27
27
|
|}
|
28
28
|
TXT
|
29
29
|
|
30
|
-
table = tables[0]
|
31
|
-
assert_equal 1, tables.size ## one table
|
32
30
|
assert_equal 3, table.size ## three rows
|
33
31
|
assert_equal ['header1', 'header2', 'header3'], table[0]
|
34
32
|
assert_equal ['row1cell1', 'row1cell2', 'row1cell3'], table[1]
|
@@ -36,7 +34,7 @@ TXT
|
|
36
34
|
end
|
37
35
|
|
38
36
|
def test_basic_ii ## with optional (missing) row divider before headers
|
39
|
-
|
37
|
+
table = Wikiscript.parse_table( <<TXT )
|
40
38
|
{|
|
41
39
|
! header1 !! header2 !! header3
|
42
40
|
|-
|
@@ -46,16 +44,48 @@ TXT
|
|
46
44
|
|}
|
47
45
|
TXT
|
48
46
|
|
49
|
-
table = tables[0]
|
50
|
-
assert_equal 1, tables.size ## one table
|
51
47
|
assert_equal 3, table.size ## three rows
|
52
48
|
assert_equal ['header1', 'header2', 'header3'], table[0]
|
53
49
|
assert_equal ['row1cell1', 'row1cell2', 'row1cell3'], table[1]
|
54
50
|
assert_equal ['row2cell1', 'row2cell2', 'row2cell3'], table[2]
|
55
51
|
end
|
56
52
|
|
53
|
+
def test_basic_iii # with continuing header column lines
|
54
|
+
table = Wikiscript.parse_table( <<TXT )
|
55
|
+
{|
|
56
|
+
|-
|
57
|
+
!
|
58
|
+
header1
|
59
|
+
!
|
60
|
+
header2
|
61
|
+
!
|
62
|
+
header3
|
63
|
+
|-
|
64
|
+
|
|
65
|
+
row1cell1
|
66
|
+
|
|
67
|
+
row1cell2
|
68
|
+
|
|
69
|
+
row1cell3
|
70
|
+
|-
|
71
|
+
|
|
72
|
+
row2cell1
|
73
|
+
|
|
74
|
+
row2cell2
|
75
|
+
|
|
76
|
+
row2cell3
|
77
|
+
|}
|
78
|
+
TXT
|
79
|
+
|
80
|
+
assert_equal 3, table.size ## three rows
|
81
|
+
assert_equal ['header1', 'header2', 'header3'], table[0]
|
82
|
+
assert_equal ['row1cell1', 'row1cell2', 'row1cell3'], table[1]
|
83
|
+
assert_equal ['row2cell1', 'row2cell2', 'row2cell3'], table[2]
|
84
|
+
end
|
85
|
+
|
86
|
+
|
57
87
|
def test_strip_attributes_and_emphases
|
58
|
-
|
88
|
+
table = Wikiscript.parse_table( <<TXT )
|
59
89
|
{|
|
60
90
|
|-
|
61
91
|
! style="width:200px;"|Club
|
@@ -69,8 +99,6 @@ TXT
|
|
69
99
|
|}
|
70
100
|
TXT
|
71
101
|
|
72
|
-
table = tables[0]
|
73
|
-
assert_equal 1, tables.size ## one table
|
74
102
|
assert_equal 4, table.size ## four rows
|
75
103
|
assert_equal ['Club', 'City'], table[0]
|
76
104
|
assert_equal ['[[Biu Chun Rangers]]', '[[Sham Shui Po]]'], table[1]
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wikiscript
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-09-
|
11
|
+
date: 2019-09-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: logutils
|