wikiscript 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e86e3e1b50b44067a5815a13155b04aa441e8023
4
- data.tar.gz: c38ed306346a9b3d2c7c365fba00c41bee4624b4
3
+ metadata.gz: fe20e2b58b4703144ef8a83d1cb41edf92f5ef3e
4
+ data.tar.gz: ae8a0d31b16099c33fee4ac450b27433852b8fb8
5
5
  SHA512:
6
- metadata.gz: 290c18216f59d7c2e6f2e1b181651413790fd2f7ff709c7f6f9c9cf1708b540480efa8827c57613df6862b1da755cd584c53f71ff93b07fe34a7a2ef4a67420d
7
- data.tar.gz: 9e0c5c402b17263d38065eaac1a9e1febee6f3a73dff382aae8d81a8d0436881a2437a784dcee83f69c50d1fb5c6c5f385458a90ebb2cff8a31d31fffbd951a5
6
+ metadata.gz: 61fe785e9ea9150f6a5711ff0038cb135aba18f719abf26d6783bf50c465678f7376c315bcce3091b325d8ac9ca4714e084f93503e21dae583cddb9c904863d5
7
+ data.tar.gz: c437c92ba5df517d21231060044ab690aba2b1b11a368d37018c65eb493cea467bbd3331f161f217c1764fc24025fcf97a57ab3889e5aa2fc6b53d57e460f13b
data/NOTES.md CHANGED
@@ -3,4 +3,33 @@
3
3
 
4
4
  ## Alternatives
5
5
 
6
- TBD
6
+
7
+
8
+ - [wikipedia-client](https://rubygems.org/gems/wikipedia-client) - by Ken Pratt et al - ruby client for the Wikipedia API
9
+ - <https://github.com/kenpratt/wikipedia-client>
10
+ - <https://www.rubydoc.info/gems/wikipedia-client>
11
+
12
+ <!-- break -->
13
+
14
+ - [infoboxer](https://rubygems.org/gems/infoboxer) - by Victor Shepelev et al - pure-Ruby Wikipedia (and generic MediaWiki) client and parser, targeting information extraction
15
+ - <https://github.com/molybdenum-99/infoboxer>
16
+ - <https://www.rubydoc.info/gems/infoboxer>
17
+
18
+ <!-- break -->
19
+
20
+ More
21
+
22
+ - <https://github.com/molybdenum-99/reality>
23
+ - <https://github.com/molybdenum-99/mediawiktory>
24
+
25
+ **Python**
26
+
27
+ - <https://pypi.org/project/wptools/> - Wikipedia tools (for Humans)
28
+ - <https://github.com/siznax/wptools/>
29
+
30
+
31
+ ## Wikipedia
32
+
33
+ - Wikipedia API reference: <http://en.wikipedia.org/w/api.php>
34
+
35
+
data/lib/wikiscript.rb CHANGED
@@ -51,11 +51,11 @@ module Wikiscript
51
51
  }x
52
52
 
53
53
 
54
- def self.unlink( value )
54
+ def self.unlink( text )
55
55
  ## replace ALL wiki links with title (or link)
56
56
  ## e.g. [[Santiago]] ([[La Florida, Chile|La Florida]])
57
57
  ## => Santiago (La Florida)
58
- value = value.gsub( LINK_PATTERN ) do |_|
58
+ text = text.gsub( LINK_PATTERN ) do |_|
59
59
  link = $~[:link]
60
60
  title = $~[:title]
61
61
 
@@ -66,14 +66,16 @@ module Wikiscript
66
66
  end
67
67
  end
68
68
 
69
- value.strip
69
+ text.strip
70
+ end
71
+ class << self
72
+ alias_method :flatten_links, :unlink
70
73
  end
71
74
 
72
-
73
- def self.parse_link( value ) ## todo/change: find a better name - use match_link/etc. - why? why not?
75
+ def self.parse_link( text ) ## todo/change: find a better name - use match_link/etc. - why? why not?
74
76
  ## find first matching link
75
77
  ## return [nil,nil] if nothing found
76
- if (m = LINK_PATTERN.match( value ))
78
+ if (m = LINK_PATTERN.match( text ))
77
79
  link = m[:link]
78
80
  title = m[:title]
79
81
 
@@ -85,6 +87,17 @@ module Wikiscript
85
87
  end
86
88
  end
87
89
 
90
+ ############################
91
+ ## more convenience shortcuts / helpers
92
+ def self.parse( text ) PageReader.parse( text ); end
93
+ def self.parse_table( text ) TableReader.parse_table( text ); end
94
+
95
+ def self.get( title, lang: Wikiscript.lang ) Page.get( title, lang: lang ); end
96
+ class << self
97
+ alias_method :fetch, :get
98
+ alias_method :download, :get
99
+ end
100
+
88
101
  end # module Wikiscript
89
102
 
90
103
 
@@ -15,30 +15,30 @@ module Wikiscript
15
15
  @worker = Fetcher::Worker.new
16
16
  end
17
17
 
18
- ## change to: wikitext why? why not? or to raw? why? why not?
19
- def text( title )
18
+ ## change to: wikitext or raw - why? why not?
19
+ def text( title, lang: Wikiscript.lang )
20
20
  ## todo/fix: convert spaces to _ if not present for wikipedia page title - why ?? why not ???
21
- get( action: 'raw', title: title )
22
- end
23
21
 
24
- private
25
- def site_base
26
- ## replace lang w/ lang config if present e.g.
22
+ ## note: replace lang w/ lang config if present e.g.
27
23
  ## http://{lang}.wikipedia.org/w/index.php
28
24
  # becomes
29
25
  # http://en.wikipedia.org/w/index.php or
30
26
  # http://de.wikipedia.org/w/index.php etc
27
+ base_url = SITE_BASE.gsub( "{lang}", lang )
28
+ params = { action: 'raw',
29
+ title: title }
31
30
 
32
- SITE_BASE.gsub( "{lang}", Wikiscript.lang )
31
+ get( base_url, params )
33
32
  end
34
33
 
34
+ private
35
35
  def build_query( h )
36
36
  h.map do |k,v|
37
37
  "#{CGI.escape(k.to_s)}=#{CGI.escape(v.to_s)}"
38
38
  end.join( '&' )
39
39
  end
40
40
 
41
- def get( params )
41
+ def get( base_url, params )
42
42
  # note: lets us pass in params as hash e.g.
43
43
  # action: 'raw', title: 'Austria'
44
44
  # key and values will get CGI escaped
@@ -46,7 +46,7 @@ private
46
46
 
47
47
  ## uri = URI.parse( "#{SITE_BASE}?#{params}" )
48
48
  ## fix: pass in uri (add to fetcher check for is_a? URI etc.)
49
- uri_string = "#{site_base}?#{query}"
49
+ uri_string = "#{base_url}?#{query}"
50
50
 
51
51
  response = @worker.get_response( uri_string )
52
52
 
@@ -69,7 +69,8 @@ private
69
69
  t
70
70
  else
71
71
  logger.error "fetch HTTP - #{response.code} #{response.message}"
72
- nil
72
+ exit 1 ### exit for now on error - why? why not?
73
+ ## nil
73
74
  end
74
75
  end
75
76
 
@@ -6,15 +6,24 @@ module Wikiscript
6
6
 
7
7
  include LogUtils::Logging
8
8
 
9
- attr_reader :title
9
+ attr_reader :title, :lang
10
10
 
11
- def initialize( title, text: nil )
11
+
12
+ def self.get( title, lang: Wikiscript.lang ) ## todo/check: add a fetch/download alias - why? why not?
13
+ o = new( title: title, lang: lang )
14
+ ## o.text ## "force" download / fetch
15
+ o
16
+ end
17
+
18
+
19
+ def initialize( text=nil, title: nil, lang: Wikiscript.lang )
12
20
  ## todo: check title
13
21
  ## replace title spaces w/ _ ????
14
22
  ## to allow "pretty" titles - why? why not??
15
23
 
16
- @title = title
17
24
  @text = text
25
+ @title = title
26
+ @lang = lang
18
27
  end
19
28
 
20
29
  def text
@@ -22,7 +31,7 @@ module Wikiscript
22
31
  end
23
32
 
24
33
  def download_text
25
- Client.new.text( @title )
34
+ Client.new.text( @title, lang: @lang )
26
35
  end
27
36
 
28
37
  def parse ## todo/change: use/find a different name e.g. doc/elements/etc. - why? why not?
@@ -53,9 +53,11 @@ class PageReader
53
53
  elsif inside_table
54
54
  table_txt << line << "\n"
55
55
  else
56
- puts "** !!! ERROR !!! unknown line type in wiki page:"
57
- pp line
58
- exit 1
56
+ ## note: skip unknown line types for now
57
+
58
+ ## puts "** !!! ERROR !!! unknown line type in wiki page:"
59
+ ## pp line
60
+ ## exit 1
59
61
  end
60
62
  end
61
63
  page
@@ -65,21 +65,31 @@ class TableReader
65
65
  row = []
66
66
  rows << row
67
67
  end
68
- ## add each value one-by-one for now (to keep (same) row reference)
69
- ## note: also strip leading (optional) attributes
70
- values.each do |value|
71
- row << strip_emphases( strip_attributes( value.strip ))
68
+ if values.empty?
69
+ ## note: support continuing column text in next line
70
+ row << String.new
71
+ else
72
+ ## add each value one-by-one for now (to keep (same) row reference)
73
+ ## note: also strip leading (optional) attributes
74
+ values.each do |value|
75
+ row << strip_emphases( strip_attributes( value.strip ))
76
+ end
72
77
  end
73
78
  elsif inside_table && line.start_with?( '|' ) ## table data
74
79
  values = line.sub( '|', '' ).strip.split( '||' )
75
- ## add each value one-by-one for now (to keep (same) row reference)
76
- values.each do |value|
77
- row << strip_emphases( strip_attributes( value.strip ))
80
+ if values.empty?
81
+ ## note: support continuing column text in next line
82
+ row << String.new
83
+ else
84
+ ## add each value one-by-one for now (to keep (same) row reference)
85
+ values.each do |value|
86
+ row << strip_emphases( strip_attributes( value.strip ))
87
+ end
78
88
  end
79
89
  elsif inside_table
80
- puts "!! ERROR !! unknown line type inside table:"
81
- puts line
82
- exit 1
90
+ ## note: support continuing column text in next line
91
+ ## todo/check: for now doesn't support multi-line just simple continuing line - fix later if needed!!!
92
+ row[-1] << line
83
93
  else
84
94
  puts "!! ERROR !! unknown line type outside (before or after) table:"
85
95
  puts line
@@ -1,6 +1,6 @@
1
1
 
2
2
  module Wikiscript
3
- VERSION = '0.2.0'
3
+ VERSION = '0.3.0'
4
4
 
5
5
  def self.banner
6
6
  "wikiscript/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
data/test/test_page.rb CHANGED
@@ -1,5 +1,9 @@
1
1
  # encoding: utf-8
2
2
 
3
+ ###
4
+ # to run use
5
+ # ruby -I ./lib -I ./test test/test_page.rb
6
+
3
7
 
4
8
  require 'helper'
5
9
 
@@ -11,7 +15,12 @@ class TestPage < MiniTest::Test
11
15
  end
12
16
 
13
17
  def test_austria_en
14
- page = Wikiscript::Page.new( 'Austria' )
18
+ page = Wikiscript::Page.get( 'Austria' )
19
+ # [debug] GET /w/index.php?action=raw&title=Austria uri=http://en.wikipedia.org/w/index.php?action=raw&title=Austria, redirect_limit=5
20
+ # [debug] 301 TLS Redirect location=https://en.wikipedia.org/w/index.php?action=raw&title=Austria
21
+ # [debug] GET /w/index.php?action=raw&title=Austria uri=https://en.wikipedia.org/w/index.php?action=raw&title=Austria, redirect_limit=4
22
+ # [debug] 200 OK
23
+
15
24
  text = page.text
16
25
 
17
26
  ## print first 600 chars
@@ -21,20 +30,25 @@ class TestPage < MiniTest::Test
21
30
  assert /{{Infobox country/ =~ text
22
31
  assert /common_name = Austria/ =~ text
23
32
  assert /capital = \[\[Vienna\]\]/ =~ text
24
- assert /The origins of modern-day Austria date back to the time/ =~ text
33
+ # assert /The origins of modern-day Austria date back to the time/ =~ text
25
34
  end
26
35
 
27
36
  def test_sankt_poelten_en
28
- page = Wikiscript::Page.new( 'Sankt_Pölten' )
37
+ page = Wikiscript::Page.get( 'Sankt_Pölten' )
38
+ # [debug] GET /w/index.php?action=raw&title=Sankt_P%C3%B6lten uri=http://en.wikipedia.org/w/index.php?action=raw&title=Sankt_P%C3%B6lten, redirect_limit=5
39
+ # [debug] 301 TLS Redirect location=https://en.wikipedia.org/w/index.php?action=raw&title=Sankt_P%C3%B6lten
40
+ # [debug] GET /w/index.php?action=raw&title=Sankt_P%C3%B6lten uri=https://en.wikipedia.org/w/index.php?action=raw&title=Sankt_P%C3%B6lten, redirect_limit=4
41
+ # [debug] 200 OK
42
+
29
43
  text = page.text
30
44
 
31
45
  ## print first 600 chars
32
46
  pp text[0..600]
33
47
 
34
48
  ## check for some snippets
35
- assert /{{Infobox Town AT/ =~ text
36
- assert /Name\s+=\s+Sankt Pölten/ =~ text
37
- assert /'''Sankt Pölten''' \(''St. Pölten''\) is the capital city of/ =~ text
49
+ assert /{{Infobox settlement/ =~ text
50
+ assert /name\s+=\s+Sankt Pölten/ =~ text
51
+ # assert /'''Sankt Pölten''' \(''St. Pölten''\) is the capital city of/ =~ text
38
52
  end
39
53
 
40
54
  end # class TestPage
data/test/test_page_de.rb CHANGED
@@ -1,6 +1,11 @@
1
1
  # encoding: utf-8
2
2
 
3
3
 
4
+ ###
5
+ # to run use
6
+ # ruby -I ./lib -I ./test test/test_page_de.rb
7
+
8
+
4
9
  require 'helper'
5
10
 
6
11
 
@@ -11,7 +16,13 @@ class TestPageDe < MiniTest::Test
11
16
  end
12
17
 
13
18
  def test_st_poelten_de
14
- page = Wikiscript::Page.new( 'St._Pölten' )
19
+ page = Wikiscript::Page.get( 'St._Pölten' )
20
+ # [debug] GET /w/index.php?action=raw&title=St._P%C3%B6lten uri=http://de.wikipedia.org/w/index.php?action=raw&title=St._P%C3%B6lten, redirect_limit=5
21
+ # [debug] 301 TLS Redirect location=https://de.wikipedia.org/w/index.php?action=raw&title=St._P%C3%B6lten
22
+ # [debug] GET /w/index.php?action=raw&title=St._P%C3%B6lten uri=https://de.wikipedia.org/w/index.php?action=raw&title=St._P%C3%B6lten, redirect_limit=4
23
+ # [debug] 200 OK
24
+
25
+
15
26
  text = page.text
16
27
 
17
28
  ## print first 600 chars
@@ -11,7 +11,7 @@ require 'helper'
11
11
  class TestPageReader < MiniTest::Test
12
12
 
13
13
  def test_basic
14
- el = Wikiscript::PageReader.parse( <<TXT )
14
+ el = Wikiscript.parse( <<TXT )
15
15
  =Heading 1==
16
16
  ==Heading 2==
17
17
  ===Heading 3===
@@ -44,7 +44,7 @@ TXT
44
44
  end
45
45
 
46
46
  def test_parse
47
- page = Wikiscript::Page.new( 'Test', text: <<TXT )
47
+ page = Wikiscript::Page.new( <<TXT )
48
48
  =Heading 1==
49
49
  ==Heading 2==
50
50
  ===Heading 3===
@@ -10,7 +10,7 @@ require 'helper'
10
10
  class TestTableReader < MiniTest::Test
11
11
 
12
12
  def test_basic
13
- tables = Wikiscript::TableReader.parse( <<TXT )
13
+ table = Wikiscript.parse_table( <<TXT )
14
14
  {|
15
15
  |-
16
16
  ! header1
@@ -27,8 +27,6 @@ class TestTableReader < MiniTest::Test
27
27
  |}
28
28
  TXT
29
29
 
30
- table = tables[0]
31
- assert_equal 1, tables.size ## one table
32
30
  assert_equal 3, table.size ## three rows
33
31
  assert_equal ['header1', 'header2', 'header3'], table[0]
34
32
  assert_equal ['row1cell1', 'row1cell2', 'row1cell3'], table[1]
@@ -36,7 +34,7 @@ TXT
36
34
  end
37
35
 
38
36
  def test_basic_ii ## with optional (missing) row divider before headers
39
- tables = Wikiscript::TableReader.parse( <<TXT )
37
+ table = Wikiscript.parse_table( <<TXT )
40
38
  {|
41
39
  ! header1 !! header2 !! header3
42
40
  |-
@@ -46,16 +44,48 @@ TXT
46
44
  |}
47
45
  TXT
48
46
 
49
- table = tables[0]
50
- assert_equal 1, tables.size ## one table
51
47
  assert_equal 3, table.size ## three rows
52
48
  assert_equal ['header1', 'header2', 'header3'], table[0]
53
49
  assert_equal ['row1cell1', 'row1cell2', 'row1cell3'], table[1]
54
50
  assert_equal ['row2cell1', 'row2cell2', 'row2cell3'], table[2]
55
51
  end
56
52
 
53
+ def test_basic_iii # with continuing header column lines
54
+ table = Wikiscript.parse_table( <<TXT )
55
+ {|
56
+ |-
57
+ !
58
+ header1
59
+ !
60
+ header2
61
+ !
62
+ header3
63
+ |-
64
+ |
65
+ row1cell1
66
+ |
67
+ row1cell2
68
+ |
69
+ row1cell3
70
+ |-
71
+ |
72
+ row2cell1
73
+ |
74
+ row2cell2
75
+ |
76
+ row2cell3
77
+ |}
78
+ TXT
79
+
80
+ assert_equal 3, table.size ## three rows
81
+ assert_equal ['header1', 'header2', 'header3'], table[0]
82
+ assert_equal ['row1cell1', 'row1cell2', 'row1cell3'], table[1]
83
+ assert_equal ['row2cell1', 'row2cell2', 'row2cell3'], table[2]
84
+ end
85
+
86
+
57
87
  def test_strip_attributes_and_emphases
58
- tables = Wikiscript::TableReader.parse( <<TXT )
88
+ table = Wikiscript.parse_table( <<TXT )
59
89
  {|
60
90
  |-
61
91
  ! style="width:200px;"|Club
@@ -69,8 +99,6 @@ TXT
69
99
  |}
70
100
  TXT
71
101
 
72
- table = tables[0]
73
- assert_equal 1, tables.size ## one table
74
102
  assert_equal 4, table.size ## four rows
75
103
  assert_equal ['Club', 'City'], table[0]
76
104
  assert_equal ['[[Biu Chun Rangers]]', '[[Sham Shui Po]]'], table[1]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wikiscript
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-09-20 00:00:00.000000000 Z
11
+ date: 2019-09-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: logutils