usfx 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (5) hide show
  1. data/README.md +46 -0
  2. data/lib/document.rb +122 -0
  3. data/lib/parser.rb +33 -0
  4. data/lib/usfx.rb +2 -0
  5. metadata +64 -0
@@ -0,0 +1,46 @@
1
+ # USFX
2
+
3
+ This is a very simple SAX Document and Nokogiri SAX Parser for reading [Unified Scripture Format XML](http://ebible.org/usfx/) (USFX) files as a stream and doing something interesting with the verse data.
4
+
5
+ I needed to get the World English Bible (WEB) into a database easily and efficiently, so this was born. This parser keeps only the verse text itself and skips footnotes and other non-verse content, which may or may not be what you want.
6
+
7
+ Pull requests welcome!
8
+
9
+ ## Install
10
+
11
+ ```
12
+ gem install usfx
13
+ ```
14
+
15
+ ## Use
16
+
17
+ You can get the World English Bible (WEB) in USFX format [here](http://ebible.org/web/). Unzip it and put the USFX XML file (`eng-web_usfx.xml` in the example below) somewhere.
18
+
19
+ ```ruby
20
+ class MyDocument < USFX::Document
21
+ def verse(data)
22
+ # do something with verse data, which looks like:
23
+ # {
24
+ # book_num: 1,
25
+ # book_id: 'GEN',
26
+ # book: 'Genesis',
27
+ # chapter: 1,
28
+ # verse: 1,
29
+ # text: 'In the beginning, God created the heavens and the earth.'
30
+ # }
31
+ end
32
+ end
33
+
34
+ parser = USFX::Parser.new(MyDocument.new)
35
+ parser.parse(File.open('eng-web_usfx.xml'))
36
+ ```
37
+
38
+ ## License
39
+
40
+ Copyright (c) 2013 Tim Morgan
41
+
42
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
43
+
44
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
45
+
46
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,122 @@
1
+ require 'nokogiri'
2
+
3
module USFX

  # The Document class used by the Nokogiri SAX Parser, with an added
  # "verse" event for convenience.
  #
  # The handler tracks the current book, chapter and verse in instance
  # variables as elements stream past, and uses @mode to decide what to do
  # with character data:
  #
  #   nil             - text is ignored
  #   'book'          - a (non-ignored) book has been opened
  #   'book-title'    - inside the book's <h> element; text goes to @book
  #   'verse'         - between <v> and <ve/>; text goes to @text
  #   'footnote|...'  - inside <f>; the part after the first '|' is the mode
  #                     to restore when the footnote closes
  class Document < Nokogiri::XML::SAX::Document

    # list of book ids that should be skipped
    IGNORE_BOOK_IDS = %w(FRT)

    # Prefix that marks @mode as "inside a footnote"; the suspended outer
    # mode follows the separator.
    FOOTNOTE_PREFIX = 'footnote|'

    def initialize
      super
      @book_num = 0
    end

    # main dispatcher, calls other specific element event methods
    #
    # name       - the element name
    # attributes - the element attributes as an array of [name, value] pairs
    def start_element(name, attributes)
      case name
      when 'book'
        start_book(attributes)
      when 'h'
        start_book_title(attributes)
      when 'c'
        start_chapter(attributes)
      when 'v'
        start_verse(attributes)
      when 've'
        # <ve/> is the verse-end marker, so the verse is completed when the
        # marker element starts
        end_verse
      when 'f'
        start_footnote(attributes)
      end
    end

    # dispatcher for closing tags
    def end_element(name)
      case name
      when 'h'
        end_book_title
      when 'f'
        end_footnote
      end
    end

    # event fired when book is started
    #
    # Books whose id is listed in IGNORE_BOOK_IDS do not advance the book
    # counter and do not change any other state.
    def start_book(attributes)
      id = Hash[attributes]['id']
      unless IGNORE_BOOK_IDS.include?(id)
        @book_num += 1
        @book_id = id
        @mode = 'book'
        @book = ''
      end
    end

    # event fired when book title is started
    #
    # Only switches to title capture directly after a book was opened, so an
    # <h> seen in any other mode is not treated as the book title.
    def start_book_title(attributes)
      @mode = 'book-title' if @mode == 'book'
    end

    # event fired when book title is ended
    def end_book_title
      @mode = nil
    end

    # event fired when chapter is started
    def start_chapter(attributes)
      @chapter = Hash[attributes]['id'].to_i
    end

    # event fired when verse is started
    def start_verse(attributes)
      @verse = Hash[attributes]['id'].to_i
      @text = ''
      @mode = 'verse'
    end

    # event fired when verse is ended
    #
    # Stops text capture and fires the "verse" event with the collected data.
    def end_verse
      @mode = nil
      verse(book_num: @book_num, book_id: @book_id, book: @book, chapter: @chapter, verse: @verse, text: @text)
    end

    # event fired when footnote is started
    #
    # Suspends the current mode by prefixing it, so that footnote text is not
    # captured by #characters.
    def start_footnote(attributes)
      @mode = "#{FOOTNOTE_PREFIX}#{@mode}"
    end

    # event fired when footnote is ended
    #
    # Restores the mode that was active when the footnote opened by removing
    # exactly one footnote prefix:
    #
    # * a footnote opened while no mode was active restores nil (rather than
    #   leaving the literal string "footnote" behind),
    # * nested footnotes unwind one level per closing tag,
    # * if the mode is not footnote-encoded (e.g. it was reset while inside
    #   the footnote), it is left untouched instead of raising on nil.
    def end_footnote
      return unless @mode.to_s.start_with?(FOOTNOTE_PREFIX)

      # limit of 2 keeps everything after the first separator intact and
      # preserves a trailing empty field for the "no outer mode" case
      outer = @mode.split('|', 2).last
      @mode = outer.empty? ? nil : outer
    end

    # event fired when encountering text data in a tag
    #
    # Text is appended to the book title or to the verse text depending on
    # the current mode; in every other mode (including inside footnotes) it
    # is dropped. A chunk that is exactly "\n" is not added to the title.
    def characters(string)
      case @mode
      when 'book-title'
        @book << string unless string == "\n"
      when 'verse'
        @text << string
      end
    end

    # event fired upon completion of a verse
    # verse data is passed as a hash of the form:
    #
    #   {
    #     book_num: 1,
    #     book_id: 'GEN',
    #     book: 'Genesis',
    #     chapter: 1,
    #     verse: 1,
    #     text: 'In the beginning, God created the heavens and the earth.'
    #   }
    #
    # By default, this event will print the raw verse data.
    # Override in your subclass to do fun stuff.
    def verse(data)
      p(data)
    end
  end
end
@@ -0,0 +1,33 @@
1
+ require 'nokogiri'
2
+
3
+ require_relative 'document'
4
+
5
module USFX

  # Convenience subclass of Nokogiri::XML::SAX::Parser whose document
  # handler defaults to a fresh USFX::Document.
  #
  # Parse with the default handler:
  #
  #   parser = USFX::Parser.new
  #   parser.parse(File.open('eng-web_usfx.xml'))
  #
  # Parse with a custom handler:
  #
  #   class MyDocument < USFX::Document
  #     def verse(data)
  #       # do something with verse data here
  #     end
  #   end
  #
  #   parser = USFX::Parser.new(MyDocument.new)
  #   parser.parse(File.open('eng-web_usfx.xml'))
  #
  # (when a handler is passed explicitly, as in the second example, this
  # class adds nothing on top of Nokogiri::XML::SAX::Parser)
  #
  class Parser < Nokogiri::XML::SAX::Parser
    # document - the SAX document handler that receives the parse events
    def initialize(document = Document.new)
      # bare super forwards this method's own argument (including the
      # default) to the parent initializer
      super
    end
  end

end
@@ -0,0 +1,2 @@
1
+ require_relative 'document'
2
+ require_relative 'parser'
metadata ADDED
@@ -0,0 +1,64 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: usfx
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.2
6
+ platform: ruby
7
+ authors:
8
+ - Tim Morgan
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-06-21 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ prerelease: false
16
+ type: :runtime
17
+ name: nokogiri
18
+ version_requirements: !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ! '>='
22
+ - !ruby/object:Gem::Version
23
+ version: 1.6.0
24
+ requirement: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 1.6.0
30
+ description:
31
+ email: tim@timmorgan.org
32
+ executables: []
33
+ extensions: []
34
+ extra_rdoc_files: []
35
+ files:
36
+ - README.md
37
+ - lib/document.rb
38
+ - lib/parser.rb
39
+ - lib/usfx.rb
40
+ homepage: https://github.com/seven1m/usfx
41
+ licenses: []
42
+ post_install_message:
43
+ rdoc_options: []
44
+ require_paths:
45
+ - lib
46
+ required_ruby_version: !ruby/object:Gem::Requirement
47
+ none: false
48
+ requirements:
49
+ - - ! '>='
50
+ - !ruby/object:Gem::Version
51
+ version: '0'
52
+ required_rubygems_version: !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ! '>='
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ requirements: []
59
+ rubyforge_project:
60
+ rubygems_version: 1.8.25
61
+ signing_key:
62
+ specification_version: 3
63
+ summary: Ruby stream parser for Unified Scripture Format XML (USFX)
64
+ test_files: []