openstax_content 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +661 -0
- data/README.md +2 -0
- data/lib/openstax/content.rb +14 -0
- data/lib/openstax/content/abl.rb +13 -0
- data/lib/openstax/content/archive.rb +104 -0
- data/lib/openstax/content/book.rb +70 -0
- data/lib/openstax/content/book_part.rb +47 -0
- data/lib/openstax/content/custom_css.rb +9 -0
- data/lib/openstax/content/fragment.rb +19 -0
- data/lib/openstax/content/fragment/embedded.rb +67 -0
- data/lib/openstax/content/fragment/exercise.rb +56 -0
- data/lib/openstax/content/fragment/html.rb +62 -0
- data/lib/openstax/content/fragment/interactive.rb +36 -0
- data/lib/openstax/content/fragment/optional_exercise.rb +4 -0
- data/lib/openstax/content/fragment/reading.rb +9 -0
- data/lib/openstax/content/fragment/video.rb +8 -0
- data/lib/openstax/content/fragment_splitter.rb +193 -0
- data/lib/openstax/content/page.rb +201 -0
- data/lib/openstax/content/s3.rb +44 -0
- data/lib/openstax/content/title.rb +18 -0
- data/lib/openstax/content/version.rb +5 -0
- metadata +162 -0
data/README.md
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Top-level namespace shared by the OpenStax libraries.
module OpenStax
  # Holds the library-wide settings read by the content classes.
  #
  # Each setting is a plain read/write accessor on the module itself, e.g.
  # OpenStax::Content.domain = 'example.org'. Settings are nil until assigned.
  module Content
    singleton_class.send(
      :attr_accessor,
      :abl_url, :archive_path, :bucket_name, :domain, :exercises_search_api_url,
      :logger, :s3_region, :s3_access_key_id, :s3_secret_access_key
    )

    # Yields the module itself so several settings can be assigned in one
    # block. Returns whatever the block returns.
    def self.configure
      yield self
    end
  end
end
|
13
|
+
|
14
|
+
# Load every Ruby file found (recursively) under the content/ directory that
# sits next to this file.
# NOTE(review): files are required in the order Dir[] returns them; files that
# need a sibling loaded first presumably rely on their own require_relative
# lines — confirm.
Dir["#{__dir__}/content/**/*.rb"].each { |file| require file }
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'json'

# Reads the ABL document (presumably the "Approved Book List") published at
# OpenStax::Content.abl_url.
class OpenStax::Content::Abl
  # Downloads and parses the ABL document on first use, then reuses the result.
  #
  # Keys are symbolized at every nesting level by the JSON parser itself, so
  # no Hash extension (such as ActiveSupport's deep_symbolize_keys) is needed.
  #
  # @return [Hash] the parsed document with Symbol keys
  # @raise [JSON::ParserError] if the response body is not valid JSON
  def body
    @body ||= JSON.parse(Faraday.get(OpenStax::Content.abl_url).body, symbolize_names: true)
  end

  # @return the value stored under the :approved_books key of the document
  def approved_books
    body[:approved_books]
  end

  # @return the value stored under the :approved_versions key of the document
  def approved_versions
    body[:approved_versions]
  end
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
require 'addressable/uri'
|
2
|
+
require 'faraday'
|
3
|
+
|
4
|
+
# Builds URLs for, and fetches JSON documents from, one version of the
# content archive. Slugs seen in fetched documents are cached per instance.
class OpenStax::Content::Archive
  # version is the archive version used as the last segment of base_url.
  def initialize(version)
    @slugs = {}
    @version = version
  end

  # "https://<domain>/<archive_path>/<version>", built from the
  # OpenStax::Content settings on first use and memoized afterwards.
  def base_url
    @base_url ||= begin
      settings = OpenStax::Content
      "https://#{settings.domain}/#{settings.archive_path}/#{@version}"
    end
  end

  # Turns an archive reference into a URL string.
  #
  # * nil                      -> nil
  # * unparseable reference    -> warns and returns the reference unchanged
  # * absolute URL             -> warns and returns it with the scheme forced to https
  # * reference without a path -> warns and returns the reference unchanged
  # * path starting with "../" -> base_url plus the path without the leading ".."
  # * anything else            -> base_url + "/contents/" + path, where a trailing
  #                               ".json" and then ".xhtml" are dropped before
  #                               ".json" is appended
  def url_for(object)
    return if object.nil?

    uri = begin
      Addressable::URI.parse object
    rescue Addressable::URI::InvalidURIError
      # Second chance: parse the reference as an absolute path
      begin
        Addressable::URI.parse "/#{object}"
      rescue Addressable::URI::InvalidURIError
        OpenStax::Content.logger.warn { "Invalid url: \"#{object}\" in archive link" }

        return object
      end
    end

    if uri.absolute?
      OpenStax::Content.logger.warn do
        "#{self.class.name} received an unexpected absolute URL in url_for: \"#{object}\""
      end

      # Force absolute URLs to be https
      uri.scheme = 'https'
      return uri.to_s
    end

    if uri.path.empty?
      OpenStax::Content.logger.warn do
        "#{self.class.name} received an unexpected fragment-only URL in url_for: \"#{object}\""
      end

      return object
    end

    if uri.path.start_with?('../')
      uri.path = uri.path.sub('..', '')
      return "#{base_url}#{uri}"
    end

    uri.path = "#{uri.path.chomp('.json').chomp('.xhtml')}.json"
    separator = uri.path.start_with?('/') ? '' : '/'
    "#{base_url}/contents#{separator}#{uri}"
  end

  # Performs a GET on url_for(object) and returns the response body.
  def fetch(object)
    target = url_for(object)
    OpenStax::Content.logger.debug { "Fetching #{target}" }
    response = Faraday.get(target)
    response.body
  end

  # Fetches and parses the JSON document for object, remembering its 'slug'.
  # Raises a RuntimeError naming the URL when the body is not valid JSON.
  def json(object)
    document = JSON.parse(fetch(object))
    @slugs[object] = document['slug']
    document
  rescue JSON::ParserError => parse_error
    raise "OpenStax Content Archive returned invalid JSON for #{url_for object}: #{parse_error.message}"
  end

  # Lazily built S3 helper.
  def s3
    @s3 ||= OpenStax::Content::S3.new
  end

  # Expects "book_uuid[@book_version][:page_id]". When the book version is
  # absent and the S3 bucket is configured, the version of the first S3 entry
  # whose uuid matches is filled in; otherwise object is returned unchanged.
  def add_latest_book_version_if_missing(object)
    book_id, page_id = object.split(':', 2)
    book_uuid, book_version = book_id.split('@', 2)
    return object unless book_version.nil?
    return object unless s3.bucket_configured?

    s3.ls(@version).each do |entry|
      entry_uuid, entry_version = entry.split('@')
      if entry_uuid == book_uuid
        book_version = entry_version
        break
      end
    end

    # chomp drops the separators again when nothing follows them
    versioned_book_id = "#{book_uuid}@#{book_version}".chomp('@')
    "#{versioned_book_id}:#{page_id}".chomp(':')
  end

  # Returns the slug for object, fetching its JSON only when no truthy slug is
  # cached yet. The result is cached under both the given reference and the
  # version-completed one.
  def slug(object)
    cached = @slugs[object]
    return cached if cached

    versioned = add_latest_book_version_if_missing(object)
    found = json(versioned)['slug']
    @slugs[versioned] = found unless versioned == object
    @slugs[object] = found
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require_relative 'archive'
|
2
|
+
require_relative 'book_part'
|
3
|
+
|
4
|
+
# One book in the content archive, identified by its uuid and version.
#
# Values not supplied to the constructor are read lazily from the book's
# archive JSON document the first time they are requested.
#
# NOTE(review): #hash returns the book's JSON document and so overrides
# Object#hash; instances presumably should not be used as Hash keys — confirm
# with callers.
class OpenStax::Content::Book
  # uuid and version have explicit readers further down, so only
  # archive_version needs a generated one; listing the other two here as well
  # would only define methods that are immediately redefined.
  attr_reader :archive_version

  # archive_version:: version of the archive the book is read from (required)
  # uuid:, version::  book identity; each falls back to hash['id'] /
  #                   hash['version'] when omitted
  # hash::            already-fetched book JSON, if available
  # title:, tree:, root_book_part:: optional precomputed values
  #
  # Raises ArgumentError when neither the keyword nor the hash provides the
  # uuid or the version.
  def initialize(
    archive_version:, uuid: nil, version: nil, hash: nil, title: nil, tree: nil, root_book_part: nil
  )
    attributes = hash || {}

    @uuid = uuid || attributes['id']
    raise ArgumentError, 'Either uuid or hash with id key is required' if @uuid.nil?

    @version = version || attributes['version']
    raise ArgumentError, 'Either version or hash with version key is required' if @version.nil?

    @archive_version = archive_version
    @hash = hash
    @title = title
    @tree = tree
    @root_book_part = root_book_part
  end

  # Archive client for this book's archive version.
  def archive
    @archive ||= OpenStax::Content::Archive.new archive_version
  end

  # URL of the book's JSON document, as built by Archive#url_for.
  def url
    @url ||= archive.url_for "#{uuid}@#{version}"
  end

  # The book URL without a trailing ".json".
  def url_fragment
    @url_fragment ||= url.chomp('.json')
  end

  # Value of the 'baked' key of the book JSON (nil when absent).
  def baked
    @baked ||= hash['baked']
  end

  # Value of the 'collated' key of the book JSON, false when absent.
  def collated
    @collated ||= hash.fetch('collated', false)
  end

  # The book's JSON document, fetched through the archive on first use.
  def hash
    @hash ||= archive.json url
  end

  def uuid
    @uuid ||= hash.fetch('id')
  end

  def short_id
    @short_id ||= hash['shortId']
  end

  def version
    @version ||= hash.fetch('version')
  end

  def title
    @title ||= hash.fetch('title')
  end

  def tree
    @tree ||= hash.fetch('tree')
  end

  # Root of the book's part hierarchy, built from the 'tree' entry.
  def root_book_part
    @root_book_part ||= OpenStax::Content::BookPart.new(hash: tree, is_root: true, book: self)
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require_relative 'title'
|
2
|
+
require_relative 'page'
|
3
|
+
|
4
|
+
# SecureRandom is used by #uuid and is not loaded by default
require 'securerandom'

# A node of a book's tree that has child entries (its hash has a 'contents'
# key). Children are either nested BookParts or Pages.
#
# NOTE(review): the :hash reader overrides Object#hash, so instances
# presumably should not be used as Hash keys — confirm with callers.
class OpenStax::Content::BookPart
  attr_reader :hash, :is_root, :book

  # hash::    the tree entry for this part
  # is_root:: true for the part built from the book's top-level tree
  # book::    the owning book, handed on to every child
  def initialize(hash: {}, is_root: false, book: nil)
    @hash = hash
    @is_root = is_root
    @book = book
  end

  # The 'title' entry wrapped in a Title; raises KeyError when it is missing.
  def parsed_title
    @parsed_title ||= OpenStax::Content::Title.new hash.fetch('title')
  end

  def book_location
    @book_location ||= parsed_title.book_location
  end

  def title
    @title ||= parsed_title.text
  end

  # Old content used to have id == "subcol" for units and chapters
  # If we encounter that, just assign a random UUID to them
  # Otherwise the id is returned without its "@version" suffix
  def uuid
    @uuid ||= begin
      uuid = hash['id']
      uuid.nil? || uuid == 'subcol' ? SecureRandom.uuid : uuid.split('@').first
    end
  end

  # The raw child entries; raises KeyError when the 'contents' key is missing.
  def contents
    @contents ||= hash.fetch('contents')
  end

  # Child objects in document order: entries that have their own 'contents'
  # key become BookParts, all others become Pages.
  def parts
    @parts ||= contents.map do |part_hash|
      if part_hash.key? 'contents'
        self.class.new book: book, hash: part_hash
      else
        OpenStax::Content::Page.new book: book, hash: part_hash
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Base class for the pieces a page is split into.
#
# Every subclass has to survive a to_yaml / YAML.load round trip. Nokogiri
# nodes cannot be serialized, so whatever is needed from the node must be
# extracted while the fragment is being initialized.
class OpenStax::Content::Fragment
  attr_reader :title, :labels, :node_id

  # node::   source node; only its :id attribute is read here
  # title::  optional fragment title
  # labels:: optional labels, replaced by an empty array when nil or false
  def initialize(node:, title: nil, labels: nil)
    @title, @labels, @node_id = title, labels || [], node[:id]
  end

  # The base fragment never carries HTML.
  def html?
    false
  end

  # The base fragment is never considered blank.
  def blank?
    false
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require_relative 'html'
|
2
|
+
|
3
|
+
# An HTML fragment built around an embedded resource (an iframe or a link).
#
# The iframe classes/title and default dimensions are class-level settings
# that subclasses inherit a copy of and may then override.
class OpenStax::Content::Fragment::Embedded < OpenStax::Content::Fragment::Html
  # Used to get the title
  TITLE_CSS = '[data-type="title"]'

  # Used to get the title if there are no title nodes
  LABEL_ATTRIBUTE = '[data-label]'

  # CSS to find embedded content urls
  TAGGED_URL_CSS = 'iframe.os-embed, a.os-embed, .os-embed iframe, .os-embed a'
  UNTAGGED_URL_CSS = 'iframe, a'

  CLASS_ATTRIBUTES = [ :iframe_classes, :iframe_title, :default_width, :default_height ]

  class << self
    attr_accessor(*CLASS_ATTRIBUTES)

    # Seeds each new subclass with the parent's current settings.
    def inherited(subclass)
      # Keep any other inherited hooks further up the chain working
      super

      CLASS_ATTRIBUTES.each do |class_attribute|
        subclass.send "#{class_attribute}=", send(class_attribute)
      end
    end
  end

  self.iframe_classes = ['os-embed']
  self.iframe_title = ''

  # Instance-level readers that delegate to the class-level settings
  CLASS_ATTRIBUTES.each do |class_attribute|
    define_method(class_attribute) { self.class.send class_attribute }
  end

  attr_reader :url, :width, :height

  # Besides the Html setup this:
  # * derives a title when none was given: the unique stripped texts of the
  #   title nodes or, if there are none, the unique data-label values,
  #   joined with "; "
  # * picks the embed node: the first tagged (os-embed) iframe/link, else the
  #   last iframe/link of any kind
  # * records url, width and height from that node
  # * for an iframe, adds the configured classes and fills in missing
  #   title/width/height before the HTML is rendered again
  def initialize(node:, title: nil, labels: nil)
    super

    @title ||= begin
      title_nodes = @node.css(TITLE_CSS)
      titles =
        if title_nodes.empty?
          @node.css(LABEL_ATTRIBUTE).map { |label_node| label_node.attr('data-label') }
        else
          title_nodes.map { |title_node| title_node.content.strip }
        end
      titles.uniq.join('; ')
    end

    url_node = @node.at_css(TAGGED_URL_CSS) || @node.css(UNTAGGED_URL_CSS).last

    @width = url_node&.[]('width') || default_width
    @height = url_node&.[]('height') || default_height
    @url = url_node&.[]('src') || url_node&.[]('href')

    if url_node&.name == 'iframe'
      node_classes = url_node['class'].to_s.split(' ') + iframe_classes
      url_node['class'] = node_classes.uniq.join(' ')
      url_node['title'] ||= iframe_title
      # To always force the default iframe size, change ||= to =
      url_node['width'] ||= default_width
      url_node['height'] ||= default_height
    end

    # Re-render so the attribute changes above are part of the stored HTML
    @to_html = @node.to_html
  end

  # Blank when no embed URL was found.
  def blank?
    url.nil? || url.empty?
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require_relative '../fragment'
|
2
|
+
|
3
|
+
# URI.decode_www_form_component comes from the standard library's uri
require 'uri'

# A fragment that refers to exercises through links to the exercises search API.
class OpenStax::Content::Fragment::Exercise < OpenStax::Content::Fragment
  # CSS to find the embed code attributes
  EXERCISE_EMBED_URL_CSS = 'a[href^="#"]'

  # Regex to extract the appropriate tag from the embed code(s)
  EXERCISE_EMBED_URL_REGEXES = {
    tag: /\A#ost\/api\/ex\/([\w\s-]+)\z/,
    nickname: /\A#exercises?\/([\w\s-]+)\z/
  }

  # CSS to find the exercise embed queries after the urls are absolutized
  ABSOLUTIZED_EMBED_URL_CSS = 'a[href*="/api/exercises"]'

  # Regex to extract the appropriate embed queries from the absolutized urls
  ABSOLUTIZED_EMBED_URL_REGEX = \
    /\/api\/exercises\/?\?q=(tag|nickname)(?::|%3A)(?:"|%22)?([\w\s%-]+?)(?:"|%22)?\z/

  attr_reader :embed_queries

  # Rewrites, in place, every "#..." link under node that matches one of the
  # embed patterns: the href becomes the exercises search API URL with the
  # query q=<field>:"<value>", and the anchor gets data-type="exercise".
  # Links that match neither pattern are left untouched.
  #
  # NOTE(review): the original comment says this runs from
  # lib/openstax/cnx/v1/page.rb during import; that path is not in this gem's
  # file list — presumably lib/openstax/content/page.rb now; confirm.
  def self.absolutize_exercise_urls!(node)
    uri = Addressable::URI.parse OpenStax::Content.exercises_search_api_url

    node.css(EXERCISE_EMBED_URL_CSS).each do |anchor|
      href = anchor.attribute('href')

      EXERCISE_EMBED_URL_REGEXES.each do |field, regex|
        embed_match = regex.match(href.value)
        next if embed_match.nil?

        uri.query_values = { q: "#{field}:\"#{embed_match[1]}\"" }
        href.value = uri.to_s
        anchor['data-type'] = 'exercise'
        # The first matching pattern wins
        break
      end
    end
  end

  # Collects [field, value] pairs (field is :tag or :nickname, value is
  # URL-decoded) from the absolutized exercise links under node.
  #
  # labels defaults to nil like the sibling fragment classes; the base class
  # turns nil into an empty array, so the resulting labels are unchanged.
  def initialize(node:, title: nil, labels: nil)
    super

    @embed_queries = node.css(ABSOLUTIZED_EMBED_URL_CSS).map do |anchor|
      url = anchor.attribute('href').value
      match = ABSOLUTIZED_EMBED_URL_REGEX.match(url)
      next if match.nil?

      [ match[1].to_sym, URI.decode_www_form_component(match[2]) ]
    end.compact
  end

  # Blank when there are no embed queries and the node had no id.
  def blank?
    embed_queries.empty? && (node_id.nil? || node_id.empty?)
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'addressable/uri'
|
2
|
+
|
3
|
+
require_relative '../fragment'
|
4
|
+
|
5
|
+
# A fragment that carries HTML. The HTML string (to_html) is the stored
# state; the Nokogiri node is rebuilt from it whenever it is missing.
class OpenStax::Content::Fragment::Html < OpenStax::Content::Fragment
  attr_reader :to_html

  # Keeps a private copy of node (re-parsed from its HTML) and the HTML
  # rendered from that copy.
  def initialize(node:, title: nil, labels: nil)
    super

    @node = Nokogiri::HTML.fragment node.to_html
    @to_html = @node.to_html
  end

  def as_json(*args)
    # Don't attempt to serialize @node (it would fail)
    super.except('node')
  end

  # True when there is any HTML at all.
  def html?
    !to_html.empty?
  end

  def blank?
    !html?
  end

  # The parsed HTML, re-created from to_html when @node is not set.
  def node
    @node ||= Nokogiri::HTML.fragment to_html
  end

  # True when css (evaluated with the custom_css handler) matches something.
  def has_css?(css, custom_css)
    !node.at_css(css, custom_css).nil?
  end

  # Adds new_node to the body element (or the root) and refreshes to_html.
  def append(new_node)
    (node.at_css('body') || node.root) << new_node

    @to_html = node.to_html
  end

  # Points fragment-only links ("#target") whose target is not part of this
  # fragment at "<reference view url>#target", then refreshes to_html.
  # Links are only rewritten when @reference_view_url is set.
  # NOTE(review): nothing in this class assigns @reference_view_url —
  # presumably a subclass or caller does; confirm.
  def transform_links!
    unless @reference_view_url.nil?
      node.css('[href]').each do |link|
        href = link.attributes['href']

        # Skip hrefs that are not valid URIs; other errors are not hidden
        uri = begin
          Addressable::URI.parse(href.value)
        rescue Addressable::URI::InvalidURIError
          nil
        end

        # Modify only fragment-only links
        next if uri.nil? || uri.absolute? || !uri.path.empty?

        # Abort if there is no target or it contains double quotes
        # or it's still present in this fragment
        target = uri.fragment
        next if target.nil? || target.empty? || target.include?('"') ||
                node.at_css("[id=\"#{target}\"], [name=\"#{target}\"]")

        # Change the link to point to the reference view
        href.value = "#{@reference_view_url}##{target}"
      end
    end

    @to_html = node.to_html
  end
end
|