wikiscript 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e86e3e1b50b44067a5815a13155b04aa441e8023
4
- data.tar.gz: c38ed306346a9b3d2c7c365fba00c41bee4624b4
3
+ metadata.gz: fe20e2b58b4703144ef8a83d1cb41edf92f5ef3e
4
+ data.tar.gz: ae8a0d31b16099c33fee4ac450b27433852b8fb8
5
5
  SHA512:
6
- metadata.gz: 290c18216f59d7c2e6f2e1b181651413790fd2f7ff709c7f6f9c9cf1708b540480efa8827c57613df6862b1da755cd584c53f71ff93b07fe34a7a2ef4a67420d
7
- data.tar.gz: 9e0c5c402b17263d38065eaac1a9e1febee6f3a73dff382aae8d81a8d0436881a2437a784dcee83f69c50d1fb5c6c5f385458a90ebb2cff8a31d31fffbd951a5
6
+ metadata.gz: 61fe785e9ea9150f6a5711ff0038cb135aba18f719abf26d6783bf50c465678f7376c315bcce3091b325d8ac9ca4714e084f93503e21dae583cddb9c904863d5
7
+ data.tar.gz: c437c92ba5df517d21231060044ab690aba2b1b11a368d37018c65eb493cea467bbd3331f161f217c1764fc24025fcf97a57ab3889e5aa2fc6b53d57e460f13b
data/NOTES.md CHANGED
@@ -3,4 +3,33 @@
3
3
 
4
4
  ## Alternatives
5
5
 
6
- TBD
6
+
7
+
8
+ - [wikipedia-client](https://rubygems.org/gems/wikipedia-client) - by Ken Pratt et al - ruby client for the Wikipedia API
9
+ - <https://github.com/kenpratt/wikipedia-client>
10
+ - <https://www.rubydoc.info/gems/wikipedia-client>
11
+
12
+ <!-- break -->
13
+
14
+ - [infoboxer](https://rubygems.org/gems/infoboxer) - by Victor Shepelev et al - pure-Ruby Wikipedia (and generic MediaWiki) client and parser, targeting information extraction
15
+ - <https://github.com/molybdenum-99/infoboxer>
16
+ - <https://www.rubydoc.info/gems/infoboxer>
17
+
18
+ <!-- break -->
19
+
20
+ More
21
+
22
+ - <https://github.com/molybdenum-99/reality>
23
+ - <https://github.com/molybdenum-99/mediawiktory>
24
+
25
+ **Python**
26
+
27
+ - <https://pypi.org/project/wptools/> - Wikipedia tools (for Humans)
28
+ - <https://github.com/siznax/wptools/>
29
+
30
+
31
+ ## Wikipedia
32
+
33
+ - Wikipedia API reference: <http://en.wikipedia.org/w/api.php>
34
+
35
+
data/lib/wikiscript.rb CHANGED
@@ -51,11 +51,11 @@ module Wikiscript
51
51
  }x
52
52
 
53
53
 
54
- def self.unlink( value )
54
+ def self.unlink( text )
55
55
  ## replace ALL wiki links with title (or link)
56
56
  ## e.g. [[Santiago]] ([[La Florida, Chile|La Florida]])
57
57
  ## => Santiago (La Florida)
58
- value = value.gsub( LINK_PATTERN ) do |_|
58
+ text = text.gsub( LINK_PATTERN ) do |_|
59
59
  link = $~[:link]
60
60
  title = $~[:title]
61
61
 
@@ -66,14 +66,16 @@ module Wikiscript
66
66
  end
67
67
  end
68
68
 
69
- value.strip
69
+ text.strip
70
+ end
71
+ class << self
72
+ alias_method :flatten_links, :unlink
70
73
  end
71
74
 
72
-
73
- def self.parse_link( value ) ## todo/change: find a better name - use match_link/etc. - why? why not?
75
+ def self.parse_link( text ) ## todo/change: find a better name - use match_link/etc. - why? why not?
74
76
  ## find first matching link
75
77
  ## return [nil,nil] if nothing found
76
- if (m = LINK_PATTERN.match( value ))
78
+ if (m = LINK_PATTERN.match( text ))
77
79
  link = m[:link]
78
80
  title = m[:title]
79
81
 
@@ -85,6 +87,17 @@ module Wikiscript
85
87
  end
86
88
  end
87
89
 
90
+ ############################
91
+ ## more convenience shortcuts / helpers
92
+ def self.parse( text ) PageReader.parse( text ); end
93
+ def self.parse_table( text ) TableReader.parse_table( text ); end
94
+
95
+ def self.get( title, lang: Wikiscript.lang ) Page.get( title, lang: lang ); end
96
+ class << self
97
+ alias_method :fetch, :get
98
+ alias_method :download, :get
99
+ end
100
+
88
101
  end # module Wikiscript
89
102
 
90
103
 
@@ -15,30 +15,30 @@ module Wikiscript
15
15
  @worker = Fetcher::Worker.new
16
16
  end
17
17
 
18
- ## change to: wikitext why? why not? or to raw? why? why not?
19
- def text( title )
18
+ ## change to: wikitext or raw? why? why not?
19
+ def text( title, lang: Wikiscript.lang )
20
20
  ## todo/fix: convert spaces to _ if not present for wikipedia page title - why ?? why not ???
21
- get( action: 'raw', title: title )
22
- end
23
21
 
24
- private
25
- def site_base
26
- ## replace lang w/ lang config if present e.g.
22
+ ## note: replace lang w/ lang config if present e.g.
27
23
  ## http://{lang}.wikipedia.org/w/index.php
28
24
  # becomes
29
25
  # http://en.wikipedia.org/w/index.php or
30
26
  # http://de.wikipedia.org/w/index.php etc
27
+ base_url = SITE_BASE.gsub( "{lang}", lang )
28
+ params = { action: 'raw',
29
+ title: title }
31
30
 
32
- SITE_BASE.gsub( "{lang}", Wikiscript.lang )
31
+ get( base_url, params )
33
32
  end
34
33
 
34
+ private
35
35
  def build_query( h )
36
36
  h.map do |k,v|
37
37
  "#{CGI.escape(k.to_s)}=#{CGI.escape(v.to_s)}"
38
38
  end.join( '&' )
39
39
  end
40
40
 
41
- def get( params )
41
+ def get( base_url, params )
42
42
  # note: lets us pass in params as a hash e.g.
43
43
  # action: 'raw', title: 'Austria'
44
44
  # key and values will get CGI escaped
@@ -46,7 +46,7 @@ private
46
46
 
47
47
  ## uri = URI.parse( "#{SITE_BASE}?#{params}" )
48
48
  ## fix: pass in uri (add to fetcher check for is_a? URI etc.)
49
- uri_string = "#{site_base}?#{query}"
49
+ uri_string = "#{base_url}?#{query}"
50
50
 
51
51
  response = @worker.get_response( uri_string )
52
52
 
@@ -69,7 +69,8 @@ private
69
69
  t
70
70
  else
71
71
  logger.error "fetch HTTP - #{response.code} #{response.message}"
72
- nil
72
+ exit 1 ### exit for now on error - why? why not?
73
+ ## nil
73
74
  end
74
75
  end
75
76
 
@@ -6,15 +6,24 @@ module Wikiscript
6
6
 
7
7
  include LogUtils::Logging
8
8
 
9
- attr_reader :title
9
+ attr_reader :title, :lang
10
10
 
11
- def initialize( title, text: nil )
11
+
12
+ def self.get( title, lang: Wikiscript.lang ) ## todo/check: add a fetch/download alias - why? why not?
13
+ o = new( title: title, lang: lang )
14
+ ## o.text ## "force" download / fetch
15
+ o
16
+ end
17
+
18
+
19
+ def initialize( text=nil, title: nil, lang: Wikiscript.lang )
12
20
  ## todo: check title
13
21
  ## replace title spaces w/ _ ????
14
22
  ## to allow "pretty" titles - why? why not??
15
23
 
16
- @title = title
17
24
  @text = text
25
+ @title = title
26
+ @lang = lang
18
27
  end
19
28
 
20
29
  def text
@@ -22,7 +31,7 @@ module Wikiscript
22
31
  end
23
32
 
24
33
  def download_text
25
- Client.new.text( @title )
34
+ Client.new.text( @title, lang: @lang )
26
35
  end
27
36
 
28
37
  def parse ## todo/change: use/find a different name e.g. doc/elements/etc. - why? why not?
@@ -53,9 +53,11 @@ class PageReader
53
53
  elsif inside_table
54
54
  table_txt << line << "\n"
55
55
  else
56
- puts "** !!! ERROR !!! unknown line type in wiki page:"
57
- pp line
58
- exit 1
56
+ ## note: skip unknown line types for now
57
+
58
+ ## puts "** !!! ERROR !!! unknown line type in wiki page:"
59
+ ## pp line
60
+ ## exit 1
59
61
  end
60
62
  end
61
63
  page
@@ -65,21 +65,31 @@ class TableReader
65
65
  row = []
66
66
  rows << row
67
67
  end
68
- ## add each value one-by-one for now (to keep (same) row reference)
69
- ## note: also strip leading (optional) attributes
70
- values.each do |value|
71
- row << strip_emphases( strip_attributes( value.strip ))
68
+ if values.empty?
69
+ ## note: support continuing column text in next line
70
+ row << String.new
71
+ else
72
+ ## add each value one-by-one for now (to keep (same) row reference)
73
+ ## note: also strip leading (optional) attributes
74
+ values.each do |value|
75
+ row << strip_emphases( strip_attributes( value.strip ))
76
+ end
72
77
  end
73
78
  elsif inside_table && line.start_with?( '|' ) ## table data
74
79
  values = line.sub( '|', '' ).strip.split( '||' )
75
- ## add each value one-by-one for now (to keep (same) row reference)
76
- values.each do |value|
77
- row << strip_emphases( strip_attributes( value.strip ))
80
+ if values.empty?
81
+ ## note: support continuing column text in next line
82
+ row << String.new
83
+ else
84
+ ## add each value one-by-one for now (to keep (same) row reference)
85
+ values.each do |value|
86
+ row << strip_emphases( strip_attributes( value.strip ))
87
+ end
78
88
  end
79
89
  elsif inside_table
80
- puts "!! ERROR !! unknown line type inside table:"
81
- puts line
82
- exit 1
90
+ ## note: support continuing column text in next line
91
+ ## todo/check: for now doesn't support multi-line just simple continuing line - fix later if needed!!!
92
+ row[-1] << line
83
93
  else
84
94
  puts "!! ERROR !! unknown line type outside (before or after) table:"
85
95
  puts line
@@ -1,6 +1,6 @@
1
1
 
2
2
  module Wikiscript
3
- VERSION = '0.2.0'
3
+ VERSION = '0.3.0'
4
4
 
5
5
  def self.banner
6
6
  "wikiscript/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
data/test/test_page.rb CHANGED
@@ -1,5 +1,9 @@
1
1
  # encoding: utf-8
2
2
 
3
+ ###
4
+ # to run use
5
+ # ruby -I ./lib -I ./test test/test_page.rb
6
+
3
7
 
4
8
  require 'helper'
5
9
 
@@ -11,7 +15,12 @@ class TestPage < MiniTest::Test
11
15
  end
12
16
 
13
17
  def test_austria_en
14
- page = Wikiscript::Page.new( 'Austria' )
18
+ page = Wikiscript::Page.get( 'Austria' )
19
+ # [debug] GET /w/index.php?action=raw&title=Austria uri=http://en.wikipedia.org/w/index.php?action=raw&title=Austria, redirect_limit=5
20
+ # [debug] 301 TLS Redirect location=https://en.wikipedia.org/w/index.php?action=raw&title=Austria
21
+ # [debug] GET /w/index.php?action=raw&title=Austria uri=https://en.wikipedia.org/w/index.php?action=raw&title=Austria, redirect_limit=4
22
+ # [debug] 200 OK
23
+
15
24
  text = page.text
16
25
 
17
26
  ## print first 600 chars
@@ -21,20 +30,25 @@ class TestPage < MiniTest::Test
21
30
  assert /{{Infobox country/ =~ text
22
31
  assert /common_name = Austria/ =~ text
23
32
  assert /capital = \[\[Vienna\]\]/ =~ text
24
- assert /The origins of modern-day Austria date back to the time/ =~ text
33
+ # assert /The origins of modern-day Austria date back to the time/ =~ text
25
34
  end
26
35
 
27
36
  def test_sankt_poelten_en
28
- page = Wikiscript::Page.new( 'Sankt_Pölten' )
37
+ page = Wikiscript::Page.get( 'Sankt_Pölten' )
38
+ # [debug] GET /w/index.php?action=raw&title=Sankt_P%C3%B6lten uri=http://en.wikipedia.org/w/index.php?action=raw&title=Sankt_P%C3%B6lten, redirect_limit=5
39
+ # [debug] 301 TLS Redirect location=https://en.wikipedia.org/w/index.php?action=raw&title=Sankt_P%C3%B6lten
40
+ # [debug] GET /w/index.php?action=raw&title=Sankt_P%C3%B6lten uri=https://en.wikipedia.org/w/index.php?action=raw&title=Sankt_P%C3%B6lten, redirect_limit=4
41
+ # [debug] 200 OK
42
+
29
43
  text = page.text
30
44
 
31
45
  ## print first 600 chars
32
46
  pp text[0..600]
33
47
 
34
48
  ## check for some snippets
35
- assert /{{Infobox Town AT/ =~ text
36
- assert /Name\s+=\s+Sankt Pölten/ =~ text
37
- assert /'''Sankt Pölten''' \(''St. Pölten''\) is the capital city of/ =~ text
49
+ assert /{{Infobox settlement/ =~ text
50
+ assert /name\s+=\s+Sankt Pölten/ =~ text
51
+ # assert /'''Sankt Pölten''' \(''St. Pölten''\) is the capital city of/ =~ text
38
52
  end
39
53
 
40
54
  end # class TestPage
data/test/test_page_de.rb CHANGED
@@ -1,6 +1,11 @@
1
1
  # encoding: utf-8
2
2
 
3
3
 
4
+ ###
5
+ # to run use
6
+ # ruby -I ./lib -I ./test test/test_page_de.rb
7
+
8
+
4
9
  require 'helper'
5
10
 
6
11
 
@@ -11,7 +16,13 @@ class TestPageDe < MiniTest::Test
11
16
  end
12
17
 
13
18
  def test_st_poelten_de
14
- page = Wikiscript::Page.new( 'St._Pölten' )
19
+ page = Wikiscript::Page.get( 'St._Pölten' )
20
+ # [debug] GET /w/index.php?action=raw&title=St._P%C3%B6lten uri=http://de.wikipedia.org/w/index.php?action=raw&title=St._P%C3%B6lten, redirect_limit=5
21
+ # [debug] 301 TLS Redirect location=https://de.wikipedia.org/w/index.php?action=raw&title=St._P%C3%B6lten
22
+ # [debug] GET /w/index.php?action=raw&title=St._P%C3%B6lten uri=https://de.wikipedia.org/w/index.php?action=raw&title=St._P%C3%B6lten, redirect_limit=4
23
+ # [debug] 200 OK
24
+
25
+
15
26
  text = page.text
16
27
 
17
28
  ## print first 600 chars
@@ -11,7 +11,7 @@ require 'helper'
11
11
  class TestPageReader < MiniTest::Test
12
12
 
13
13
  def test_basic
14
- el = Wikiscript::PageReader.parse( <<TXT )
14
+ el = Wikiscript.parse( <<TXT )
15
15
  =Heading 1==
16
16
  ==Heading 2==
17
17
  ===Heading 3===
@@ -44,7 +44,7 @@ TXT
44
44
  end
45
45
 
46
46
  def test_parse
47
- page = Wikiscript::Page.new( 'Test', text: <<TXT )
47
+ page = Wikiscript::Page.new( <<TXT )
48
48
  =Heading 1==
49
49
  ==Heading 2==
50
50
  ===Heading 3===
@@ -10,7 +10,7 @@ require 'helper'
10
10
  class TestTableReader < MiniTest::Test
11
11
 
12
12
  def test_basic
13
- tables = Wikiscript::TableReader.parse( <<TXT )
13
+ table = Wikiscript.parse_table( <<TXT )
14
14
  {|
15
15
  |-
16
16
  ! header1
@@ -27,8 +27,6 @@ class TestTableReader < MiniTest::Test
27
27
  |}
28
28
  TXT
29
29
 
30
- table = tables[0]
31
- assert_equal 1, tables.size ## one table
32
30
  assert_equal 3, table.size ## three rows
33
31
  assert_equal ['header1', 'header2', 'header3'], table[0]
34
32
  assert_equal ['row1cell1', 'row1cell2', 'row1cell3'], table[1]
@@ -36,7 +34,7 @@ TXT
36
34
  end
37
35
 
38
36
  def test_basic_ii ## with optional (missing) row divider before headers
39
- tables = Wikiscript::TableReader.parse( <<TXT )
37
+ table = Wikiscript.parse_table( <<TXT )
40
38
  {|
41
39
  ! header1 !! header2 !! header3
42
40
  |-
@@ -46,16 +44,48 @@ TXT
46
44
  |}
47
45
  TXT
48
46
 
49
- table = tables[0]
50
- assert_equal 1, tables.size ## one table
51
47
  assert_equal 3, table.size ## three rows
52
48
  assert_equal ['header1', 'header2', 'header3'], table[0]
53
49
  assert_equal ['row1cell1', 'row1cell2', 'row1cell3'], table[1]
54
50
  assert_equal ['row2cell1', 'row2cell2', 'row2cell3'], table[2]
55
51
  end
56
52
 
53
+ def test_basic_iii # with continuing header column lines
54
+ table = Wikiscript.parse_table( <<TXT )
55
+ {|
56
+ |-
57
+ !
58
+ header1
59
+ !
60
+ header2
61
+ !
62
+ header3
63
+ |-
64
+ |
65
+ row1cell1
66
+ |
67
+ row1cell2
68
+ |
69
+ row1cell3
70
+ |-
71
+ |
72
+ row2cell1
73
+ |
74
+ row2cell2
75
+ |
76
+ row2cell3
77
+ |}
78
+ TXT
79
+
80
+ assert_equal 3, table.size ## three rows
81
+ assert_equal ['header1', 'header2', 'header3'], table[0]
82
+ assert_equal ['row1cell1', 'row1cell2', 'row1cell3'], table[1]
83
+ assert_equal ['row2cell1', 'row2cell2', 'row2cell3'], table[2]
84
+ end
85
+
86
+
57
87
  def test_strip_attributes_and_emphases
58
- tables = Wikiscript::TableReader.parse( <<TXT )
88
+ table = Wikiscript.parse_table( <<TXT )
59
89
  {|
60
90
  |-
61
91
  ! style="width:200px;"|Club
@@ -69,8 +99,6 @@ TXT
69
99
  |}
70
100
  TXT
71
101
 
72
- table = tables[0]
73
- assert_equal 1, tables.size ## one table
74
102
  assert_equal 4, table.size ## four rows
75
103
  assert_equal ['Club', 'City'], table[0]
76
104
  assert_equal ['[[Biu Chun Rangers]]', '[[Sham Shui Po]]'], table[1]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wikiscript
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-09-20 00:00:00.000000000 Z
11
+ date: 2019-09-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: logutils