wiki-yggdrasil 0.1.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.ruby-version +1 -0
- data/README.md +25 -12
- data/lib/wiki/article.rb +26 -7
- data/lib/wiki/yggdrasil.rb +22 -12
- data/lib/wiki/yggdrasil/version.rb +1 -1
- data/wiki-yggdrasil.gemspec +1 -3
- metadata +6 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c9987a3fa945f2a041c202f1f46b19855885cb76cf9b19b5b853eb63f401d100
|
4
|
+
data.tar.gz: 41a39a82cb8bbc5c4bea0aecc96fefeedc3ce671799653f8018b3d55d138abe4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f3faa407a997c9ad97581426782ea00b43ccf43fdcfc245ed219035048610c37f9a16e02f23694db8b7ef3cb48f54f709251cc6331649e2ffe07d089d169629
|
7
|
+
data.tar.gz: 59a856c97b0284194c5a7abbdb52e725dc0b8b1550d02bb05b30b7ebc367b57fa31ddf9fccd714fdc905e02bd03b8b107cf2d230c6a39a925f8e38e2fb35d418
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.6.1
|
data/README.md
CHANGED
@@ -1,10 +1,6 @@
|
|
1
1
|
# Wiki::Yggdrasil
|
2
2
|
![Travis CI Build](https://travis-ci.org/alex0112/wiki-yggdrasil.svg?branch=master)
|
3
3
|
|
4
|
-
You. You're up late at night again reading up on some obscure mathematical topic. You find yourself with *so many* open tabs on Wikipedia. Wouldn't it be nice if you could just pick an article, and then view a tree of the articles it references?
|
5
|
-
|
6
|
-
Introducing Wiki::Yggdrasil. Named after the tree in Norse mythology that drinks from the well of all wisdom, Wiki::Yggdrasil is here to help you drink just as deeply from the well of wisdom that is Wikipedia.
|
7
|
-
|
8
4
|
Wiki::Yggdrasil takes a Wikipedia URI as an argument, and proceeds to spider out a dependency tree of referenced articles.
|
9
5
|
|
10
6
|
## Usage
|
@@ -12,16 +8,33 @@ Wiki::Yggdrasil takes a Wikipedia URI as an argument, and proceeds to spider out
|
|
12
8
|
require 'wiki/yggdrasil'
|
13
9
|
|
14
10
|
@tree = Wiki::Yggdrasil::Yggdrasil.new(uri: 'http://en.wikipedia.org/wiki/Yggdrasil')
|
15
|
-
referenced_articles = @tree.children(depth:
|
11
|
+
referenced_articles = @tree.children(depth: 1) ## A hash of articles linked by the parent
|
12
|
+
```
|
13
|
+
The preceding code produces the following structure:
|
14
|
+
```ruby
|
15
|
+
{:name=>"Help:IPA/English", :children=>[]}
|
16
|
+
{:name=>"Help:IPA/English", :children=>[]}
|
17
|
+
{:name=>"Old Norse", :children=>[]}
|
18
|
+
{:name=>"Help:IPA", :children=>[]}
|
19
|
+
{:name=>"Trees in mythology", :children=>[]}
|
20
|
+
{:name=>"Norse cosmology", :children=>[]}
|
21
|
+
{:name=>"<i>Poetic Edda</i>", :children=>[]}
|
22
|
+
{:name=>"<i>Prose Edda</i>", :children=>[]}
|
23
|
+
{:name=>"Snorri Sturluson", :children=>[]}
|
24
|
+
{:name=>"<i>Fraxinus excelsior</i>", :children=>[]}
|
25
|
+
{:name=>"Æsir", :children=>[]}
|
26
|
+
{:name=>"Thing (assembly)", :children=>[]}
|
27
|
+
{:name=>"Urðarbrunnr", :children=>[]}
|
28
|
+
{:name=>"Hvergelmir", :children=>[]}
|
29
|
+
{:name=>"Mímisbrunnr", :children=>[]}
|
30
|
+
{:name=>"Níðhöggr", :children=>[]}
|
31
|
+
{:name=>"Veðrfölnir and eagle", :children=>[]}
|
32
|
+
{:name=>"Dáinn, Dvalinn, Duneyrr and Duraþrór", :children=>[]}
|
33
|
+
{:name=>"Sacred trees and groves in Germanic paganism and mythology", :children=>[]}
|
34
|
+
{:name=>"Ragnarök", :children=>[]}
|
16
35
|
```
|
17
36
|
|
18
|
-
|
19
|
-
|
20
|
-
### This is taking a long time. Is that normal?
|
21
|
-
Yes. This is normal. Any Yggdrasil object created with a depth of three or higher will likely take a few minutes to scrape the necessary information.
|
22
|
-
|
23
|
-
### Why didn't you just use Wikipedia's API?
|
24
|
-
Wikipedia's API doesn't have an endpoint that allows you to programmatically view the summary section of each article and its children. If it did, that would obviously be the ideal choice.
|
37
|
+
_Note: Any Yggdrasil object created with a depth of three or higher will likely take a few minutes to scrape the necessary information._
|
25
38
|
|
26
39
|
## Installation
|
27
40
|
|
data/lib/wiki/article.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
require 'open-uri'
|
3
|
+
|
3
4
|
module Wiki::Yggdrasil
|
4
5
|
|
5
6
|
class Article
|
@@ -10,34 +11,52 @@ module Wiki::Yggdrasil
|
|
10
11
|
@uri = uri
|
11
12
|
@summary = nil
|
12
13
|
@child_links = nil
|
14
|
+
@name = nil
|
15
|
+
@checksum = nil
|
13
16
|
end
|
14
|
-
|
17
|
+
|
15
18
|
def summary
|
16
19
|
@summary ||= Nokogiri::HTML(Nokogiri::HTML(open(self.uri)).to_s.split('<div id="toc" class="toc">')[0]).css('p') ## TODO: Cleanup
|
17
20
|
end
|
18
21
|
|
19
|
-
def
|
22
|
+
def checksum
|
23
|
+
Digest::MD5.hexdigest(@summary.to_s)
|
24
|
+
end
|
25
|
+
|
26
|
+
def child_links(help: false)
|
20
27
|
formatted_links = format_links
|
21
|
-
validated_links =
|
28
|
+
validated_links = formatted_links.select { |uri| Wiki::Yggdrasil::Article.is_valid_wiki_article?(uri: uri) }
|
29
|
+
|
22
30
|
@child_links ||= validated_links
|
23
31
|
end
|
24
32
|
|
25
|
-
def
|
26
|
-
self.summary.css('p a')
|
33
|
+
def scrape_links(help_links: false) ## TODO test help_links param in spec
|
34
|
+
help_links ? self.summary.css('p a') : self.summary.css('p a[href!="/wiki/Help:IPA/English"]')
|
27
35
|
end
|
28
36
|
|
29
|
-
def
|
37
|
+
def name
|
38
|
+
@name ||= Nokogiri::HTML(open(self.uri)).css('#firstHeading').inner_html
|
39
|
+
## TODO: Cleanup
|
40
|
+
end
|
41
|
+
|
42
|
+
def format_links(anchors: self.scrape_links)
|
30
43
|
uris = anchors.map do |anchor|
|
31
44
|
anchor.nil? || anchor['href'].nil? ? next : 'https://en.wikipedia.org' << anchor['href'] ## nil href attributes are often self refs (but possibly not always). Ignore them.
|
45
|
+
## TODO: take care of this in .scrape_links with a css selector (like the Help:IPA links)
|
32
46
|
end
|
33
47
|
|
34
48
|
uris.compact
|
35
49
|
end
|
50
|
+
|
51
|
+
def self.remove_italic_tags(uri_list)
|
52
|
+
|
53
|
+
end
|
36
54
|
|
37
55
|
def self.is_valid_wiki_article?(uri:)
|
38
|
-
## Is this URI a wikipedia article?
|
39
56
|
uri =~ /.*wikipedia\.org\/wiki\/.+/ ? true : false
|
40
57
|
end
|
41
58
|
|
42
59
|
end
|
60
|
+
|
61
|
+
|
43
62
|
end
|
data/lib/wiki/yggdrasil.rb
CHANGED
@@ -13,21 +13,31 @@ module Wiki
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def children(depth: 4, article_children: self.root.child_links)
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
16
|
+
@children ||= { name: self.root.name, children: recursive_scrape(depth: depth), index: 0, depth: 0 }
|
17
|
+
end
|
18
|
+
|
19
|
+
def recursive_scrape(depth: 1, children: @root.child_links)
|
20
|
+
children.each_with_index.map do |uri, index|
|
21
|
+
article = Wiki::Yggdrasil::Article.new(uri: uri)
|
22
|
+
if (depth == 1)
|
23
|
+
{
|
24
|
+
name: article.name,
|
25
|
+
index: index + 1,
|
26
|
+
level: depth,
|
27
|
+
children: [],
|
28
|
+
}
|
29
|
+
else
|
30
|
+
{
|
31
|
+
name: article.name,
|
32
|
+
index: index + 1,
|
33
|
+
level: depth,
|
34
|
+
children: recursive_scrape(depth - 1, article.child_links),
|
35
|
+
}
|
25
36
|
end
|
26
37
|
end
|
27
|
-
|
28
|
-
@children ||= get_children.call(depth, article_children)
|
29
38
|
end
|
30
|
-
|
31
39
|
end
|
40
|
+
|
32
41
|
end
|
42
|
+
|
33
43
|
end
|
data/wiki-yggdrasil.gemspec
CHANGED
@@ -30,9 +30,7 @@ Gem::Specification.new do |spec|
|
|
30
30
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
31
31
|
spec.require_paths = ["lib"]
|
32
32
|
|
33
|
-
spec.add_runtime_dependency "nokogiri", "~> 1.
|
34
|
-
|
35
|
-
spec.add_development_dependency "bundler", "~> 1.16"
|
33
|
+
spec.add_runtime_dependency "nokogiri", "~> 1.10"
|
36
34
|
spec.add_development_dependency "rake", "~> 10.0"
|
37
35
|
spec.add_development_dependency "rspec", "~> 3.0"
|
38
36
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wiki-yggdrasil
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- alex0112
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-09-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -16,28 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 1.
|
19
|
+
version: '1.10'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 1.
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: bundler
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - "~>"
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '1.16'
|
34
|
-
type: :development
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - "~>"
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '1.16'
|
26
|
+
version: '1.10'
|
41
27
|
- !ruby/object:Gem::Dependency
|
42
28
|
name: rake
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -76,6 +62,7 @@ extra_rdoc_files: []
|
|
76
62
|
files:
|
77
63
|
- ".gitignore"
|
78
64
|
- ".rspec"
|
65
|
+
- ".ruby-version"
|
79
66
|
- ".travis.yml"
|
80
67
|
- Gemfile
|
81
68
|
- LICENSE.txt
|
@@ -107,8 +94,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
107
94
|
- !ruby/object:Gem::Version
|
108
95
|
version: '0'
|
109
96
|
requirements: []
|
110
|
-
|
111
|
-
rubygems_version: 2.7.4
|
97
|
+
rubygems_version: 3.0.3
|
112
98
|
signing_key:
|
113
99
|
specification_version: 4
|
114
100
|
summary: Scrape Wikipedia articles and generate a json tree
|