wiki-yggdrasil 0.1.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -0
- data/README.md +25 -12
- data/lib/wiki/article.rb +26 -7
- data/lib/wiki/yggdrasil.rb +22 -12
- data/lib/wiki/yggdrasil/version.rb +1 -1
- data/wiki-yggdrasil.gemspec +1 -3
- metadata +6 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c9987a3fa945f2a041c202f1f46b19855885cb76cf9b19b5b853eb63f401d100
|
4
|
+
data.tar.gz: 41a39a82cb8bbc5c4bea0aecc96fefeedc3ce671799653f8018b3d55d138abe4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f3faa407a997c9ad97581426782ea00b43ccf43fdcfc245ed219035048610c37f9a16e02f23694db8b7ef3cb48f54f709251cc6331649e2ffe07d089d169629
|
7
|
+
data.tar.gz: 59a856c97b0284194c5a7abbdb52e725dc0b8b1550d02bb05b30b7ebc367b57fa31ddf9fccd714fdc905e02bd03b8b107cf2d230c6a39a925f8e38e2fb35d418
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.6.1
|
data/README.md
CHANGED
@@ -1,10 +1,6 @@
|
|
1
1
|
# Wiki::Yggdrasil
|
2
2
|

|
3
3
|
|
4
|
-
You. You're up late at night again reading up on some obscure mathematical topic. You find yourself with *so many* open tabs on Wikipedia. Wouldn't it be nice if you could just pick an article, and then view a tree of the articles it references?
|
5
|
-
|
6
|
-
Introducing Wiki::Yggdrasil. Named after the tree in Norse mythology that drinks from the well of all wisdom, Wiki::Yggdrasil is here to help you drink just as deeply from the well of wisdom that is Wikipedia.
|
7
|
-
|
8
4
|
Wiki::Yggdrasil takes a Wikipedia URI as an argument, and proceeds to spider out a dependency tree of referenced articles.
|
9
5
|
|
10
6
|
## Usage
|
@@ -12,16 +8,33 @@ Wiki::Yggdrasil takes a Wikipedia URI as an argument, and proceeds to spider out
|
|
12
8
|
require 'wiki/yggdrasil'
|
13
9
|
|
14
10
|
@tree = Wiki::Yggdrasil::Yggdrasil.new(uri: 'http://en.wikipedia.org/wiki/Yggdrasil')
|
15
|
-
referenced_articles = @tree.children(depth:
|
11
|
+
referenced_articles = @tree.children(depth: 1) ## A hash of articles linked by the parent
|
12
|
+
```
|
13
|
+
The preceding code produces a structure as follows:
|
14
|
+
```ruby
|
15
|
+
{:name=>"Help:IPA/English", :children=>[]}
|
16
|
+
{:name=>"Help:IPA/English", :children=>[]}
|
17
|
+
{:name=>"Old Norse", :children=>[]}
|
18
|
+
{:name=>"Help:IPA", :children=>[]}
|
19
|
+
{:name=>"Trees in mythology", :children=>[]}
|
20
|
+
{:name=>"Norse cosmology", :children=>[]}
|
21
|
+
{:name=>"<i>Poetic Edda</i>", :children=>[]}
|
22
|
+
{:name=>"<i>Prose Edda</i>", :children=>[]}
|
23
|
+
{:name=>"Snorri Sturluson", :children=>[]}
|
24
|
+
{:name=>"<i>Fraxinus excelsior</i>", :children=>[]}
|
25
|
+
{:name=>"Æsir", :children=>[]}
|
26
|
+
{:name=>"Thing (assembly)", :children=>[]}
|
27
|
+
{:name=>"Urðarbrunnr", :children=>[]}
|
28
|
+
{:name=>"Hvergelmir", :children=>[]}
|
29
|
+
{:name=>"Mímisbrunnr", :children=>[]}
|
30
|
+
{:name=>"Níðhöggr", :children=>[]}
|
31
|
+
{:name=>"Veðrfölnir and eagle", :children=>[]}
|
32
|
+
{:name=>"Dáinn, Dvalinn, Duneyrr and Duraþrór", :children=>[]}
|
33
|
+
{:name=>"Sacred trees and groves in Germanic paganism and mythology", :children=>[]}
|
34
|
+
{:name=>"Ragnarök", :children=>[]}
|
16
35
|
```
|
17
36
|
|
18
|
-
|
19
|
-
|
20
|
-
### This is taking a long time. Is that normal?
|
21
|
-
Yes. This is normal. Any Yggdrasil object created with a depth of three or higher will likely take a few minutes to scrape the necessary information.
|
22
|
-
|
23
|
-
### Why didn't you just use Wikipedia's API?
|
24
|
-
Wikipedia's API doesn't have an endpoint that allows you to programmatically view the summary section of each article and its children. If it did, that would obviously be the ideal choice.
|
37
|
+
_Note: Any Yggdrasil object created with a depth of three or higher will likely take a few minutes to scrape the necessary information._
|
25
38
|
|
26
39
|
## Installation
|
27
40
|
|
data/lib/wiki/article.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
require 'open-uri'
|
3
|
+
|
3
4
|
module Wiki::Yggdrasil
|
4
5
|
|
5
6
|
class Article
|
@@ -10,34 +11,52 @@ module Wiki::Yggdrasil
|
|
10
11
|
@uri = uri
|
11
12
|
@summary = nil
|
12
13
|
@child_links = nil
|
14
|
+
@name = nil
|
15
|
+
@checksum = nil
|
13
16
|
end
|
14
|
-
|
17
|
+
|
15
18
|
def summary
|
16
19
|
@summary ||= Nokogiri::HTML(Nokogiri::HTML(open(self.uri)).to_s.split('<div id="toc" class="toc">')[0]).css('p') ## TODO: Cleanup
|
17
20
|
end
|
18
21
|
|
19
|
-
def
|
22
|
+
def checksum
|
23
|
+
Digest::MD5.hexdigest(@summary.to_s)
|
24
|
+
end
|
25
|
+
|
26
|
+
def child_links(help: false)
|
20
27
|
formatted_links = format_links
|
21
|
-
validated_links =
|
28
|
+
validated_links = formatted_links.select { |uri| Wiki::Yggdrasil::Article.is_valid_wiki_article?(uri: uri) }
|
29
|
+
|
22
30
|
@child_links ||= validated_links
|
23
31
|
end
|
24
32
|
|
25
|
-
def
|
26
|
-
self.summary.css('p a')
|
33
|
+
def scrape_links(help_links: false) ## TODO test help_links param in spec
|
34
|
+
help_links ? self.summary.css('p a') : self.summary.css('p a[href!="/wiki/Help:IPA/English"]')
|
27
35
|
end
|
28
36
|
|
29
|
-
def
|
37
|
+
def name
|
38
|
+
@name ||= Nokogiri::HTML(open(self.uri)).css('#firstHeading').inner_html
|
39
|
+
## TODO: Cleanup
|
40
|
+
end
|
41
|
+
|
42
|
+
def format_links(anchors: self.scrape_links)
|
30
43
|
uris = anchors.map do |anchor|
|
31
44
|
anchor.nil? || anchor['href'].nil? ? next : 'https://en.wikipedia.org' << anchor['href'] ## nil href attributes are often self refs (but possibly not always). Ignore them.
|
45
|
+
## TODO: take care of this in .scrape_links with a css selector (like the Help:IPA links)
|
32
46
|
end
|
33
47
|
|
34
48
|
uris.compact
|
35
49
|
end
|
50
|
+
|
51
|
+
def self.remove_italic_tags(uri_list)
|
52
|
+
|
53
|
+
end
|
36
54
|
|
37
55
|
def self.is_valid_wiki_article?(uri:)
|
38
|
-
## Is this URI a wikipedia article?
|
39
56
|
uri =~ /.*wikipedia\.org\/wiki\/.+/ ? true : false
|
40
57
|
end
|
41
58
|
|
42
59
|
end
|
60
|
+
|
61
|
+
|
43
62
|
end
|
data/lib/wiki/yggdrasil.rb
CHANGED
@@ -13,21 +13,31 @@ module Wiki
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def children(depth: 4, article_children: self.root.child_links)
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
16
|
+
@children ||= { name: self.root.name, children: recursive_scrape(depth: depth), index: 0, depth: 0 }
|
17
|
+
end
|
18
|
+
|
19
|
+
def recursive_scrape(depth: 1, children: @root.child_links)
|
20
|
+
children.each_with_index.map do |uri, index|
|
21
|
+
article = Wiki::Yggdrasil::Article.new(uri: uri)
|
22
|
+
if (depth == 1)
|
23
|
+
{
|
24
|
+
name: article.name,
|
25
|
+
index: index + 1,
|
26
|
+
level: depth,
|
27
|
+
children: [],
|
28
|
+
}
|
29
|
+
else
|
30
|
+
{
|
31
|
+
name: article.name,
|
32
|
+
index: index + 1,
|
33
|
+
level: depth,
|
34
|
+
children: recursive_scrape(depth - 1, article.child_links),
|
35
|
+
}
|
25
36
|
end
|
26
37
|
end
|
27
|
-
|
28
|
-
@children ||= get_children.call(depth, article_children)
|
29
38
|
end
|
30
|
-
|
31
39
|
end
|
40
|
+
|
32
41
|
end
|
42
|
+
|
33
43
|
end
|
data/wiki-yggdrasil.gemspec
CHANGED
@@ -30,9 +30,7 @@ Gem::Specification.new do |spec|
|
|
30
30
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
31
31
|
spec.require_paths = ["lib"]
|
32
32
|
|
33
|
-
spec.add_runtime_dependency "nokogiri", "~> 1.
|
34
|
-
|
35
|
-
spec.add_development_dependency "bundler", "~> 1.16"
|
33
|
+
spec.add_runtime_dependency "nokogiri", "~> 1.10"
|
36
34
|
spec.add_development_dependency "rake", "~> 10.0"
|
37
35
|
spec.add_development_dependency "rspec", "~> 3.0"
|
38
36
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wiki-yggdrasil
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- alex0112
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-09-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -16,28 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 1.
|
19
|
+
version: '1.10'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 1.
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: bundler
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - "~>"
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '1.16'
|
34
|
-
type: :development
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - "~>"
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '1.16'
|
26
|
+
version: '1.10'
|
41
27
|
- !ruby/object:Gem::Dependency
|
42
28
|
name: rake
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -76,6 +62,7 @@ extra_rdoc_files: []
|
|
76
62
|
files:
|
77
63
|
- ".gitignore"
|
78
64
|
- ".rspec"
|
65
|
+
- ".ruby-version"
|
79
66
|
- ".travis.yml"
|
80
67
|
- Gemfile
|
81
68
|
- LICENSE.txt
|
@@ -107,8 +94,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
107
94
|
- !ruby/object:Gem::Version
|
108
95
|
version: '0'
|
109
96
|
requirements: []
|
110
|
-
|
111
|
-
rubygems_version: 2.7.4
|
97
|
+
rubygems_version: 3.0.3
|
112
98
|
signing_key:
|
113
99
|
specification_version: 4
|
114
100
|
summary: Scrape Wikipedia articles and generate a json tree
|