semantictext 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG ADDED
File without changes
data/COPYING ADDED
@@ -0,0 +1,32 @@
1
+ Semantic Text Licence
2
+
3
+ COPYRIGHT AND PERMISSION NOTICE
4
+
5
+ Copyright (c) 2009 Green Bar Software Limited, UK
6
+
7
+ All rights reserved.
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a
10
+ copy of this software and associated documentation files (the
11
+ "Software"), to deal in the Software without restriction, including
12
+ without limitation the rights to use, copy, modify, merge, publish,
13
+ distribute, and/or sell copies of the Software, and to permit persons
14
+ to whom the Software is furnished to do so, provided that the above
15
+ copyright notice(s) and this permission notice appear in all copies of
16
+ the Software and that both the above copyright notice(s) and this
17
+ permission notice appear in supporting documentation.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
22
+ OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
23
+ HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
24
+ INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
25
+ FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
26
+ NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
27
+ WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
28
+
29
+ Except as contained in this notice, the name of a copyright holder
30
+ shall not be used in advertising or otherwise to promote the sale, use
31
+ or other dealings in this Software without prior written authorization
32
+ of the copyright holder.
data/README.rdoc ADDED
@@ -0,0 +1,35 @@
1
+ = Semantic Text
2
+ Semantic Text is a Domain-Specific text markup parser.
3
+ It takes a file or sequence of lines and returns an object model of the document,
4
+ including document metadata (e.g. doc creation time and title) and a tree of
5
+ interconnected objects describing the document structure.
6
+
7
+ == RDOC API
8
+ The rdoc can be found at http://www.greenbarsoft.co.uk/software/semantictext/rdoc/
9
+
10
+ = How to use it
11
+ * Parse with SemanticText::Parser.new.parse_from(file) (parse_from is an instance method)
12
+ * Generate HTML by calling export_html on the same SemanticText::Parser instance
13
+
14
+ = Semantic Markup
15
+ Semantic text supports:
16
+ * document metadata
17
+ * section headers
18
+ * nested bullet points
19
+ * paragraphs that contain markup tags
20
+ * inline hyperlinks for http:, mailto: and ftp: URLs
21
+ * markup tags within bullet points
22
+
23
+ We intend to support these features in future:
24
+ * custom markup tags e.g. postal code, youtube video embed, ...
25
+
26
+
27
+ == Compatibility
28
+ This project is being developed on OS X. Automated testing for Linux will be included in future releases.
29
+
30
+ == Licence
31
+ This is open source software and comes with no warranty. See COPYING for details.
32
+
33
+ http://www.greenbarsoft.co.uk
34
+
35
+ Copyright 2009 Green Bar Software Limited, UK
data/TODO.rdoc ADDED
@@ -0,0 +1,23 @@
1
+ ==to do
2
+ * support custom structure tags
3
+ * improve testing by mocking out tag factory used in tests - consider how/whether to do this
4
+ * support urls as a special structure tag
5
+ * support wikinames as a special custom tag
6
+ * publish rdoc
7
+ * build gem
8
+ * test gem
9
+ * publish gem
10
+
11
+ ==maybe
12
+ * think about how to support twitter with special structure tags e.g. #keyword and @user
13
+ * refactor parser into header parser and text parser
14
+ * pull out parsers for different parts and use the state pattern
15
+
16
+ ==done
17
+ * handle proper HTML escaping
18
+ * remove html and head elements from html output (it's here to be embedded in webpages)
19
+ * removed surrounding square brackets from text in Tag objects
20
+ * bullet lines should be parsed as paragraphs to support tags and inline hyperlinks
21
+ * supported paragraphs that start with links
22
+ * remove absolute paths from tests - use ENV['SANDBOX']
23
+
@@ -0,0 +1,22 @@
1
+ require 'semantictext/parser'
2
+
module SemanticText

  # A single bullet point. I am a Paragraph whose inline content is filled in
  # by handing my text to a rich text parser, and I remember the nesting
  # depth I was created with.
  class Bullet < Paragraph
    # nesting depth passed to the constructor
    attr_reader :depth

    # text::             the bullet's text (without the leading stars)
    # depth::            nesting depth of the bullet
    # rich_text_parser:: object responding to parse(text, enclosing_element);
    #                    it is asked to append the parsed elements to me
    def initialize(text, depth, rich_text_parser)
      super()
      @depth = depth
      rich_text_parser.parse(text, self)
    end

    #export as html: the exported content elements wrapped in <li>...</li>
    def export_html
      items = content.map { |element| element.export_html }
      "<li>#{items.join}</li>"
    end

  end

end
@@ -0,0 +1,21 @@
1
+ require 'semantictext/bulletedlist'
2
+ require 'semantictext/bullet'
3
+
module SemanticText
  # I turn successive bullet lines ("* text", "** nested text", ...) into
  # Bullet objects and append them to a single BulletedList of depth 1.
  class BulletedListParser

    # the BulletedList that receives every parsed bullet
    attr_reader :bulleted_list

    # rich_text_parser:: passed on to each Bullet, which uses it to parse
    #                    the bullet's text
    def initialize(rich_text_parser)
      @rich_text_parser = rich_text_parser
      @bulleted_list = BulletedList.new(1)
    end

    # Parse one bullet line and append the resulting Bullet to bulleted_list.
    #
    # The number of leading '*' characters is the bullet's depth; everything
    # after them (minus separating whitespace) is the bullet's text.
    # Whitespace after the stars is optional, so "*text" is accepted as well
    # as "* text" - previously such a line made the match fail and the method
    # died with NoMethodError on nil.
    #
    # Raises ArgumentError if the line does not start with '*'.
    def parse_line(bulleted_line)
      match = bulleted_line.match(/^(\*+)\s*(.*$)/)
      raise ArgumentError, "not a bullet line: #{bulleted_line.inspect}" if match.nil?

      depth = match[1].size
      @bulleted_list << Bullet.new(match[2], depth, @rich_text_parser)
    end

  end
end
@@ -0,0 +1,36 @@
module SemanticText
  # A list of bullets at one nesting depth. My content holds Bullet-like
  # objects (anything with depth and export_html) and nested BulletedLists
  # for deeper bullets.
  class BulletedList
    # the bullets and nested lists, in insertion order
    attr_reader :content
    # nesting depth of this list
    attr_reader :depth

    def initialize(depth)
      @content = []
      @depth = depth
    end

    # Append a bullet. A bullet deeper than this list is delegated to a
    # nested list of depth+1, which is created unless the last element
    # already is a BulletedList.
    def <<(bullet)
      return @content << bullet unless bullet.depth > @depth

      @content << BulletedList.new(depth + 1) unless @content.last.class == BulletedList
      @content.last << bullet
    end

    # number of direct elements (a nested list counts as one)
    def size
      @content.size
    end

    #export as html: a <ul> indented with one tab per depth level
    def export_html
      indent = "\t" * depth
      items = content.map { |element| element.export_html }.join
      "\n#{indent}<ul>#{items}\n#{indent}</ul>"
    end

  end

end
@@ -0,0 +1,27 @@
module SemanticText
  # I turn dates written as "<day> <month name> <year>", e.g.
  # "5 November 2005", into Time objects.
  class DateExtractor
    # full English month names mapped to their month numbers
    MONTHS = {
      'January' => 1,
      'February' => 2,
      'March' => 3,
      'April' => 4,
      'May' => 5,
      'June' => 6,
      'July' => 7,
      'August' => 8,
      'September' => 9,
      'October' => 10,
      'November' => 11,
      'December' => 12
    }.freeze

    # Extract a date from +string+.
    #
    # The string is split on whitespace into day, month name and year; the
    # month name must be one of the keys of MONTHS (same capitalisation).
    #
    # Returns the result of Time.local(year, month, day).
    #
    # Raises ExtractionFailed when the month name is not recognised. That
    # class is defined in semantictext/extraction_failed, which this file does
    # not load itself (semantictext/parser requires it).
    def extract_from(string)
      day, month_name, year = string.split(' ')
      month = MONTHS[month_name]
      # raise, not throw: throw is for catch/throw tags and would surface as
      # an "uncaught throw" error instead of ExtractionFailed
      raise ExtractionFailed, "unrecognised month in date: #{string.inspect}" if month.nil?

      Time.local(year, month, day)
    end
  end
end
@@ -0,0 +1,9 @@
module SemanticText
  # The tag factory a SemanticText::Parser uses by default: every
  # create_tag(name, value) call is answered with a new SemanticText::Tag.
  class DefaultTagFactory
    # name::  the tag's key
    # value:: the tag's text
    def create_tag(name, value)
      Tag.new(name, value)
    end
  end
end
@@ -0,0 +1,4 @@
module SemanticText
  # Signals that a value could not be extracted from its textual form.
  #
  # Derived from StandardError (not Exception) so that an ordinary
  # "rescue => e" in calling code catches it.
  class ExtractionFailed < StandardError
  end
end
@@ -0,0 +1,15 @@
# CGI.escapeHTML is used below; load it here so this file also works when it
# is required on its own.
require 'cgi'

module SemanticText
  # A section heading holding its title text.
  class Heading
    # the heading's title, unescaped
    attr_reader :text

    def initialize(aTitle)
      @text = aTitle
    end

    #export as html: the HTML-escaped title inside <h1>, preceded by a newline
    def export_html
      "\n<h1>#{CGI.escapeHTML(@text)}</h1>"
    end

  end
end
@@ -0,0 +1,9 @@
module SemanticText
  # I split a comma-separated keyword string into its keywords.
  class KeywordExtractor
    # Returns an Array with one entry per comma-separated field of +string+,
    # each stripped of surrounding whitespace.
    def extract_from(string)
      string.split(',').map { |keyword| keyword.strip }
    end
  end
end
@@ -0,0 +1,9 @@
1
+ require 'cgi'
module SemanticText
  # An inline hyperlink. My text (reader inherited from Span) is the URL,
  # which is used both as the link target and as the visible link text.
  class Link < Span
    # characters that could terminate the double-quoted href attribute or
    # open a new element
    HREF_UNSAFE = /["<>]/

    # export as html
    #
    # The visible text is fully HTML-escaped. In the href only the characters
    # in HREF_UNSAFE are escaped, so a URL taken from document text cannot
    # break out of the attribute; '&' is deliberately left as it is, which
    # keeps the href output for ordinary query strings unchanged.
    def export_html
      href = text.gsub(HREF_UNSAFE) { |char| CGI.escapeHTML(char) }
      "<a href=\"#{href}\">#{CGI.escapeHTML(text)}</a>"
    end
  end
end
@@ -0,0 +1,4 @@
module SemanticText
  # Raised when a line offered as a document header line turns out not to
  # be one.
  #
  # Derived from StandardError (not Exception) so that it behaves like an
  # ordinary application error; "rescue NotHeaderLine" works as before.
  class NotHeaderLine < StandardError
  end
end
@@ -0,0 +1,24 @@
1
+ require 'semantictext/span'
2
+
module SemanticText

  # A paragraph: an ordered collection of inline elements, each of which
  # must respond to export_html.
  class Paragraph
    # the inline elements, in the order they were appended
    attr_reader :content

    def initialize
      @content = []
    end

    #export as html: the exported elements inside <p>, preceded by a newline
    def export_html
      body = content.map { |element| element.export_html }.join
      "\n<p>#{body}</p>"
    end

    # append an inline element
    def <<(span)
      @content.push(span)
    end

  end
end
@@ -0,0 +1,124 @@
require 'semantictext/extraction_failed'
require 'semantictext/date_extractor'
require 'semantictext/heading'
require 'semantictext/keyword_extractor'
require 'semantictext/not_header_line'
require 'semantictext/paragraph'
require 'semantictext/span'
require 'semantictext/link'
require 'semantictext/tag'
require 'string'
require 'semantictext/bulletedlist'
require 'semantictext/bullet'
require 'semantictext/bulleted_list_parser'
require 'semantictext/rich_text_parser'

module SemanticText

  # I parse a semantic text document line by line.
  #
  # A document starts with optional header lines of the form "name:value"
  # (title, createdAt, keywords). The first line that contains no ':' ends
  # the header section. After that, lines starting with '!' are headings,
  # lines starting with '*' are bullets, empty lines end the current
  # paragraph and everything else is paragraph text.
  class Parser
    # title of the document
    attr_reader :title

    # date the document was created
    attr_reader :createdAt

    # keyword list for the current document
    attr_reader :keywords

    # pathname of the file currently being parsed (if it exists, nil otherwise)
    attr_reader :pathname

    # the object model of the parsed document
    attr_reader :content

    def initialize
      @pathname = nil
      @headers_completed = false
      @content = []
      @current_paragraph = nil
      @bulleted_list_parser = nil
      @rich_text_parser = RichTextParser.new(DefaultTagFactory.new)
    end

    # export as html: the export of every content element followed by a
    # final newline
    def export_html
      content.map { |element| element.export_html }.join + "\n"
    end

    # true iff I have seen the end of the headers section at the top of the document
    def parameters_complete?
      @headers_completed
    end

    # parse a document into this object from pathname specified by file
    #
    # The file is opened with a block so it is closed even when parsing a
    # line raises. Returns nil.
    def parse_from(file)
      @pathname = file
      File.open(file) do |f|
        f.each_line { |line| parse(line) }
      end
      nil
    end

    # parse an individual <i>line</i> of String appending content
    # into the current document held by this object
    #
    # The line is chomped into a copy, so the caller's string is left
    # untouched and frozen strings are accepted.
    def parse(line)
      line = line.chomp
      if @headers_completed
        parse_line(line)
      else
        begin
          process_header_line(line)
        rescue NotHeaderLine
          # the first non-header line ends the header section and is itself
          # treated as body text
          @headers_completed = true
          parse_line(line)
        end
      end
    end

    private

    # Handle a "name:value" header line; unknown names are ignored.
    # Raises NotHeaderLine if the line contains no ':'.
    def process_header_line(headerLine)
      attributeName, value = headerLine.split(':', 2)
      raise NotHeaderLine if value.nil?

      case attributeName.strip
      when 'title'
        @title = value
      when 'createdAt'
        @createdAt = DateExtractor.new.extract_from(value)
      when 'keywords'
        @keywords = KeywordExtractor.new.extract_from(value)
      end
    end

    # Add the line's parsed content to the current paragraph, starting a new
    # paragraph first if there is none.
    def parse_paragraph_line(line)
      if @current_paragraph.nil?
        @current_paragraph = Paragraph.new
        @content << @current_paragraph
      end
      @rich_text_parser.parse(line, @current_paragraph)
    end

    # Add the bullet line to the current bulleted list, starting a new list
    # first if there is none.
    def parse_bullet_line(line)
      if @bulleted_list_parser.nil?
        @bulleted_list_parser = BulletedListParser.new(@rich_text_parser)
        @content << @bulleted_list_parser.bulleted_list
      end
      @bulleted_list_parser.parse_line(line)
    end

    # Dispatch one body line (headers already finished).
    def parse_line(line)
      # any line that is not a bullet ends the current bulleted list
      @bulleted_list_parser = nil unless line.begins_with('*')

      if line == ''
        @current_paragraph = nil
      elsif line.begins_with('!')
        # a heading ends the current paragraph; otherwise text following the
        # heading would be appended to the paragraph placed before it
        @current_paragraph = nil
        @content << Heading.new(line[1, line.size - 1])
      elsif line.begins_with('*')
        # same for a bullet: later text must not land before the list
        @current_paragraph = nil
        parse_bullet_line(line)
      else
        parse_paragraph_line(line)
      end
    end

  end

end
@@ -0,0 +1,60 @@
1
+ require 'semantictext/span'
2
+ require 'semantictext/link'
3
+ require 'semantictext/tag'
4
+
5
+ require 'string'
6
+ require 'semantictext/default_tag_factory'
7
+
module SemanticText

  # I parse chunks of text into a sequence of spans, tags and urls.
  class RichTextParser
    # Splits a line around "[name:value]" tags. The name may contain neither
    # ':' nor square brackets, so unrelated bracketed text such as "[note]"
    # standing before a real tag is not swallowed into the tag's name.
    TAG_SPLITTER = /(\[[^:\[\]]+:[^\]]+\])/

    # Takes a "[name:value]" tag apart into name and value.
    TAG_PARTS = /\A\[([^:\[\]]+):([^\]]+)\]\z/

    # Splits text around http:, ftp: and mailto: URLs; a URL runs up to the
    # next space. The scheme group is non-capturing, so split yields strictly
    # alternating text / URL sections.
    URL_SPLITTER = /((?:http|ftp|mailto):[^ ]*)/

    # I need a tag_factory on which I can call create_tag(name, content_text) to create my tags.
    def initialize(tag_factory)
      @tag_factory = tag_factory
    end

    # Parse +line+ and append the resulting elements to +enclosing_element+
    # (anything that responds to <<): tags come from the tag factory, the
    # text between tags is split further into Span and Link objects.
    # Tags take precedence over URLs.
    def parse(line, enclosing_element)
      # split with a single capture group: even indexes are plain text,
      # odd indexes are tags
      line.split(TAG_SPLITTER).each_with_index do |section, index|
        if index.odd?
          tag_name, tag_value = section.match(TAG_PARTS).captures
          enclosing_element << @tag_factory.create_tag(tag_name, tag_value)
        else
          parse_text_for_urls(section, enclosing_element)
        end
      end
    end

    private

    # Append a Span for every stretch of plain text and a Link for every URL
    # in +text+. Text that starts with a URL gets no empty leading Span.
    def parse_text_for_urls(text, enclosing_element)
      text.split(URL_SPLITTER).each_with_index do |section, index|
        if index.odd?
          enclosing_element << Link.new(section)
        elsif !(index == 0 && section == '')
          enclosing_element << Span.new(section)
        end
      end
    end

  end
end
@@ -0,0 +1,17 @@
1
+ require 'cgi'
module SemanticText
  # A run of plain text inside a paragraph or bullet.
  class Span
    # the unescaped text
    attr_reader :text

    def initialize(content)
      @text = content
    end

    #export as html: a single space followed by the HTML-escaped text
    def export_html
      " #{CGI.escapeHTML(text)}"
    end

  end

end
@@ -0,0 +1,16 @@
# CGI.escapeHTML is used by export_html
require 'cgi'

module SemanticText
  # A "[key:value]" markup tag. The value is held in text (reader inherited
  # from Span), the key in key.
  class Tag < Span
    # the part of the tag before the colon
    attr_reader :key

    def initialize(key, value)
      @text = value
      @key = key
    end

    #export as html: the tag in its "[key:text]" source form, with key and
    # text HTML-escaped so that markup characters inside a tag cannot inject
    # HTML into the exported page
    def export_html
      "[#{CGI.escapeHTML(@key.to_s)}:#{CGI.escapeHTML(@text.to_s)}]"
    end

  end
end
@@ -0,0 +1 @@
1
+ require 'semantictext/parser'
data/lib/string.rb ADDED
@@ -0,0 +1,5 @@
class String
  # true iff I start with +substring+.
  #
  # Uses start_with?, which only looks at my beginning, instead of searching
  # the whole string for the first occurrence and comparing its index with 0.
  def begins_with(substring)
    start_with?(substring)
  end
end
@@ -0,0 +1,24 @@
1
+ require 'test/unit'
2
+ require 'semantictext/default_tag_factory'
3
+ require 'semantictext/rich_text_parser'
4
+ require 'semantictext/bullet'
5
+
# Checks that a Bullet keeps its depth and has its text split into spans
# and links.
class BulletTest < Test::Unit::TestCase

  # assert that +actual+ is exactly an +element_class+ with the given text
  def assert_element(element_class, text, actual)
    assert_equal element_class, actual.class
    assert_equal text, actual.text
  end

  def test_bullet_line_with_links
    rich_text_parser = SemanticText::RichTextParser.new(SemanticText::DefaultTagFactory.new)
    test_string = "beginning http://www.example.com moretext http://www.dafydd.net ending text"
    unit = SemanticText::Bullet.new(test_string, 1, rich_text_parser)

    assert_equal(1, unit.depth)

    expected_elements = [
      [SemanticText::Span, "beginning "],
      [SemanticText::Link, "http://www.example.com"],
      [SemanticText::Span, " moretext "],
      [SemanticText::Link, "http://www.dafydd.net"],
      [SemanticText::Span, " ending text"]
    ]
    expected_elements.each_with_index do |(element_class, text), index|
      assert_element element_class, text, unit.content[index]
    end
  end

end
@@ -0,0 +1,61 @@
1
+ require 'test/unit'
2
+ require 'semantictext/default_tag_factory'
3
+ require 'semantictext/rich_text_parser'
4
+ require 'semantictext/bulleted_list_parser'
5
+
# Checks that BulletedListParser builds flat and nested bulleted lists.
class BulletedListParserTest < Test::Unit::TestCase

  # assert that +actual+ is exactly an +element_class+ with the given text
  def assert_element(element_class, text, actual)
    assert_equal element_class, actual.class
    assert_equal text, actual.text
  end

  # a fresh parser wired to the default rich text parser
  def new_unit
    SemanticText::BulletedListParser.new(SemanticText::RichTextParser.new(SemanticText::DefaultTagFactory.new))
  end

  def test_creating_simple_bulleted_list
    unit = new_unit
    ['* foogoo', '* second'].each { |line| unit.parse_line(line) }

    bullets = unit.bulleted_list.content
    assert_equal SemanticText::Bullet, bullets[0].class
    assert_equal SemanticText::Bullet, bullets[1].class

    span_line1 = bullets[0].content[0]
    span_line2 = bullets[1].content[0]
    assert_equal SemanticText::Span, span_line1.class, '1st elt of 1st bullet point should be a span'
    assert_equal SemanticText::Span, span_line2.class, '1st elt of 2nd bullet point should be a span'

    assert_equal 1, bullets[0].content.size, 'should only be 1 elt in 1st bullet point'
    assert_equal 1, bullets[1].content.size, 'should only be 1 elt in 2nd bullet point'

    assert_equal "foogoo", span_line1.text
    assert_equal "second", span_line2.text
  end

  def test_nested_bulleting
    unit = new_unit
    ['* top-level', '** nested'].each { |line| unit.parse_line(line) }

    top_level_list = unit.bulleted_list
    assert_equal SemanticText::BulletedList, top_level_list.class
    assert_equal 2, top_level_list.content.size

    bullet1 = top_level_list.content[0]
    assert_equal SemanticText::Bullet, bullet1.class
    assert_equal SemanticText::Span, bullet1.content[0].class
    assert_equal 'top-level', bullet1.content[0].text
    assert_equal 1, bullet1.depth

    nested_bulleted_list = top_level_list.content[1]
    assert_equal SemanticText::BulletedList, nested_bulleted_list.class
    assert_equal 2, nested_bulleted_list.depth
    assert_equal 1, nested_bulleted_list.content.size

    nested_bullet_point = nested_bulleted_list.content[0]
    assert_equal SemanticText::Bullet, nested_bullet_point.class
    assert_equal 2, nested_bullet_point.depth

    span = nested_bullet_point.content[0]
    assert_equal SemanticText::Span, span.class
    assert_equal 'nested', span.text
  end

end
@@ -0,0 +1,19 @@
require 'test/unit'
require 'semantictext/extraction_failed'
require 'semantictext/date_extractor'

# Tests for SemanticText::DateExtractor.
class TestDateExtractor< Test::Unit::TestCase

  # "<day> <month name> <year>" is converted into a local Time
  def testExtractDateFromHappyString
    unit = SemanticText::DateExtractor.new
    result = unit.extract_from('5 November 2005')
    assert_equal 5, result.day
    assert_equal 11, result.month
    assert_equal 2005, result.year
    assert_equal Time, result.class
  end

  # An unknown month name must be reported with an ExtractionFailed
  # exception. assert_raise is the matching assertion: assert_throws checks
  # catch/throw tags and can never match an exception class.
  def testExtractRejectsInvalidMonth
    unit = SemanticText::DateExtractor.new
    assert_raise(SemanticText::ExtractionFailed) { unit.extract_from('5 x 2005')}
  end
end
@@ -0,0 +1,50 @@
1
+ require 'test/unit'
2
+ require 'semantictext/parser'
3
+
# Checks the HTML produced by SemanticText::Parser#export_html, including
# HTML escaping in paragraphs, headings, bullet points and links.
class TestExport < Test::Unit::TestCase

  # Parses testfiles/complex.art below the directory named by the SANDBOX
  # environment variable and compares the export, line by line, with
  # testfiles/regression-exportsample.txt.
  def test_end_to_end_loading
    # fetch fails with a message naming SANDBOX when the variable is unset
    testfiles = ENV.fetch('SANDBOX') + '/semantictext/testfiles'

    unit = SemanticText::Parser.new
    unit.parse_from(testfiles + '/complex.art')
    actual = unit.export_html.split(/\n/)

    # File.readlines closes the file again, unlike File.new(...).readlines
    expected = File.readlines(testfiles + '/regression-exportsample.txt')

    expected.each_with_index do |expected_line, index|
      # interpolation turns a missing actual line into an assertion failure
      # rather than a NoMethodError on nil
      assert_equal expected_line, "#{actual[index]}\n"
    end
  end

  def test_escaping_paragraphs
    unit = SemanticText::Parser.new
    unit.parse ''
    unit.parse 'escaping test < > &'

    assert_equal "\n<p> escaping test &lt; &gt; &amp;</p>\n", unit.export_html
  end

  def test_escaping_headings
    unit = SemanticText::Parser.new
    unit.parse ''
    unit.parse '!heading < > &'

    assert_equal "\n<h1>heading &lt; &gt; &amp;</h1>\n", unit.export_html
  end

  def test_escaping_bullet_points
    unit = SemanticText::Parser.new
    unit.parse ''
    unit.parse '* < > &'

    assert_equal "\n\t<ul><li> &lt; &gt; &amp;</li>\n\t</ul>\n", unit.export_html
  end

  # the visible link text is escaped; the href keeps '&' as written
  def test_escaping_link
    unit = SemanticText::Parser.new
    unit.parse ''
    unit.parse 'http://www.example.com/app?name1=value1&name2=value2'

    assert_equal "\n<p><a href=\"http://www.example.com/app?name1=value1&name2=value2\">http://www.example.com/app?name1=value1&amp;name2=value2</a></p>\n", unit.export_html
  end

end
@@ -0,0 +1,13 @@
1
+ require 'test/unit'
2
+ require 'semantictext/keyword_extractor'
3
+
# Tests for SemanticText::KeywordExtractor.
class TestKeywordExtractor< Test::Unit::TestCase

  # surrounding whitespace never ends up in the extracted keywords
  def test_extract_keywords_happy_case
    unit = SemanticText::KeywordExtractor.new
    inputs = [' a , b , c , d ', ' a , b , c , d ', 'a,b,c,d']
    inputs.each do |input|
      assert_equal ['a','b','c','d'], unit.extract_from(input)
    end
  end

end
@@ -0,0 +1,292 @@
1
+ require 'test/unit'
2
+ require 'semantictext/parser'
3
+
# Exercises SemanticText::Parser: header (metadata) lines, headings,
# paragraphs, inline links and tags, and bulleted lists.
#
# Multi-line documents are fed to the parser with String#each_line;
# String#each no longer exists on Ruby 1.9 and later.
class TestParser < Test::Unit::TestCase

  # assert that +actual+ is exactly an +element_class+ with the given text
  def assert_element(element_class, text, actual)
    assert_equal element_class, actual.class
    assert_equal text, actual.text
  end

  # parses testfiles/simple.art below the directory named by ENV['SANDBOX']
  def test_end_to_end_loading
    unit = SemanticText::Parser.new
    unit.parse_from(ENV['SANDBOX']+'/semantictext/testfiles/simple.art')
    assert_equal 'test title', unit.title
    assert_equal 5, unit.createdAt.day
    assert_equal 11, unit.createdAt.month
    assert_equal 2005, unit.createdAt.year
    assert_equal Time, unit.createdAt.class
    assert_equal ENV['SANDBOX']+'/semantictext/testfiles/simple.art', unit.pathname

    resultant_heading = unit.content[0]
    resultant_par0 = unit.content[1]
    resultant_span0_0 = resultant_par0.content[0]
    resultant_span0_1 = resultant_par0.content[1]
    resultant_par1 = unit.content[2]
    resultant_span1_0 = resultant_par1.content[0]

    assert_element SemanticText::Heading, "First Big Heading", resultant_heading

    assert_equal SemanticText::Paragraph, resultant_par0.class
    assert_element SemanticText::Span, 'This is another', resultant_span0_0
    assert_element SemanticText::Span, 'paragraph.', resultant_span0_1

    assert_equal SemanticText::Paragraph, resultant_par1.class
    assert_element SemanticText::Span, 'Theis is the second paragraph.', resultant_span1_0
  end


  def test_headerless_document_parse
    unit = SemanticText::Parser.new
    test_lines = <<EOF
!First Big Heading

This is some text.
EOF
    test_lines.each_line {|line| unit.parse(line)}

    assert_element SemanticText::Heading, "First Big Heading", unit.content[0]
    assert_equal SemanticText::Paragraph, unit.content[1].class
    assert_element SemanticText::Span, "This is some text.", unit.content[1].content[0]
    assert_nil unit.title
    assert_nil unit.createdAt
    assert_nil unit.keywords
  end

  def test_parse_paragraph_beginning_with_url
    unit = SemanticText::Parser.new
    test_lines = <<EOF

http://www.dafydd.net/foogoo?blah see? http://www.example.com
I wonder if it worked!
EOF
    test_lines.each_line {|line| unit.parse(line)}
    result = unit.content[0]
    assert_equal SemanticText::Paragraph, result.class
    assert_element SemanticText::Link, "http://www.dafydd.net/foogoo?blah", result.content[0]
    assert_element SemanticText::Span, " see? ", result.content[1]
    assert_element SemanticText::Link, "http://www.example.com", result.content[2]
    assert_element SemanticText::Span, "I wonder if it worked!", result.content[3]
    assert_equal 4, result.content.size
  end

  def test_headerless_document_parse_with_url
    unit = SemanticText::Parser.new
    test_lines = <<EOF

Embedded link http://www.dafydd.net/foogoo?blah see?
I wonder if it worked!
a mailto:foogoo b ftp://asdfasdfasdf c
EOF
    test_lines.each_line {|line| unit.parse(line)}

    result = unit.content[0]
    assert_equal SemanticText::Paragraph, result.class
    assert_element SemanticText::Span, "Embedded link ", result.content[0]
    assert_element SemanticText::Link, "http://www.dafydd.net/foogoo?blah", result.content[1]
    assert_element SemanticText::Span, " see?", result.content[2]
    assert_element SemanticText::Span, "I wonder if it worked!", result.content[3]
    assert_element SemanticText::Span, 'a ', result.content[4]
    assert_element SemanticText::Link, 'mailto:foogoo', result.content[5]
    assert_element SemanticText::Span, ' b ', result.content[6]
    assert_element SemanticText::Link, 'ftp://asdfasdfasdf', result.content[7]
    assert_element SemanticText::Span, ' c', result.content[8]
    assert_equal 9, result.content.size
  end

  def test_paragraph_parsing
    unit = SemanticText::Parser.new
    test_lines = <<EOF
title:test title
createdAt:5 November 2005
keywords: buzz, fuzz, muzz

!First Big Heading

This is another
paragraph.

Theis is the second paragraph.

EOF
    test_lines.each_line {|line| unit.parse(line)}

    assert_element SemanticText::Heading, "First Big Heading", unit.content[0]
    assert_equal SemanticText::Paragraph, unit.content[1].class
    assert_element SemanticText::Span, "This is another", unit.content[1].content[0]
    assert_element SemanticText::Span, "paragraph.", unit.content[1].content[1]

    assert_equal SemanticText::Paragraph, unit.content[2].class
    assert_element SemanticText::Span, "Theis is the second paragraph.", unit.content[2].content[0]
    assert_equal 3, unit.content.size
  end

  def test_parsing_of_parameters
    unit = SemanticText::Parser.new
    unit.parse('title:test title')
    unit.parse('createdAt:5 November 2005')
    unit.parse('keywords: buzz, fuzz, muzz')
    unit.parse('')

    assert_equal 'test title', unit.title
    assert_equal 5, unit.createdAt.day
    assert_equal 11, unit.createdAt.month
    assert_equal 2005, unit.createdAt.year
    assert_equal Time, unit.createdAt.class
    assert unit.keywords == ['buzz', 'fuzz', 'muzz']
    assert_equal 3, unit.keywords.size
    assert_nil unit.pathname
    assert unit.parameters_complete?
  end

  # a URL inside square brackets is part of the tag, not a link
  def test_precedence_of_url_lower_than_tag
    unit = SemanticText::Parser.new
    unit.parse('')
    unit.parse('Embedded tag [http://www.dafydd.net/foogoo?blah name:here] see?')

    result = unit.content[0]
    assert_equal SemanticText::Paragraph, result.class
    assert_element SemanticText::Span, "Embedded tag ", result.content[0]
    assert_element SemanticText::Tag, "//www.dafydd.net/foogoo?blah name:here", result.content[1]
    assert_element SemanticText::Span, " see?", result.content[2]
    assert_equal 3, result.content.size
  end

  def test_headerless_document_parse_with_tags
    unit = SemanticText::Parser.new
    test_lines = <<EOF

Embedded tag [rfc:822] see?
I wonder if it worked!
a [tags:red balloon] b [c2:RecentChanges] c
EOF
    test_lines.each_line {|line| unit.parse(line)}

    result = unit.content[0]
    assert_equal SemanticText::Paragraph, result.class
    assert_element SemanticText::Span, "Embedded tag ", result.content[0]
    assert_element SemanticText::Tag, "822", result.content[1]
    assert_element SemanticText::Span, " see?", result.content[2]
    assert_element SemanticText::Span, "I wonder if it worked!", result.content[3]
    assert_element SemanticText::Span, 'a ', result.content[4]
    assert_element SemanticText::Tag, 'red balloon', result.content[5]
    assert_element SemanticText::Span, ' b ', result.content[6]
    assert_element SemanticText::Tag, 'RecentChanges', result.content[7]
    assert_element SemanticText::Span, ' c', result.content[8]
    assert_equal 9, result.content.size
  end

  def test_paragraphs_headings_and_bullet_points
    unit = SemanticText::Parser.new
    unit.parse('')
    unit.parse('!heading')
    unit.parse('This is a paragraph')
    unit.parse('* bullet point')
    unit.parse('** nested bullet point')
    unit.parse('')
    unit.parse('* separate list')

    assert_element SemanticText::Heading, "heading", unit.content[0]
    assert_equal SemanticText::Paragraph, unit.content[1].class
    assert_element SemanticText::Span, "This is a paragraph", unit.content[1].content[0]
    assert_equal SemanticText::BulletedList, unit.content[2].class

    bullets = unit.content[2]
    assert_equal 2, bullets.content.size

    bullet_1 = bullets.content[0]
    assert_equal SemanticText::Bullet, bullet_1.class
    assert_equal SemanticText::Span, bullet_1.content[0].class
    assert_equal 1, bullet_1.content.size
    assert_equal "bullet point", bullet_1.content[0].text

    nested = bullets.content[1]
    assert_equal SemanticText::BulletedList, nested.class
    assert_equal 1, nested.content.size

    nested_bullet = nested.content[0]

    assert_equal SemanticText::Bullet, nested_bullet.class
    assert_equal "nested bullet point", nested_bullet.content[0].text

    second_bullets = unit.content[3]
    assert_equal SemanticText::BulletedList, second_bullets.class
    assert_equal 1, second_bullets.size

    assert_equal SemanticText::Bullet, second_bullets.content[0].class
    assert_equal SemanticText::Span, second_bullets.content[0].content[0].class
    assert_equal "separate list", second_bullets.content[0].content[0].text
  end

  def test_bullet_points_with_urls_and_tags
    unit = SemanticText::Parser.new
    unit.parse('')
    unit.parse('* with url http://www.example.com see?')
    unit.parse('* with tag [c2:RecentChanges] see?')

    actual_list = unit.content[0]
    first_bullet = actual_list.content[0]
    second_bullet = actual_list.content[1]

    assert_equal SemanticText::BulletedList, actual_list.class

    assert_element SemanticText::Span, 'with url ', first_bullet.content[0]
    assert_element SemanticText::Link, 'http://www.example.com', first_bullet.content[1]
    assert_element SemanticText::Span, ' see?', first_bullet.content[2]

    assert_element SemanticText::Span, 'with tag ', second_bullet.content[0]
    assert_element SemanticText::Tag, 'RecentChanges', second_bullet.content[1]
    assert_element SemanticText::Span, ' see?', second_bullet.content[2]
  end

  def test_bulleted_list_nesting
    unit = SemanticText::Parser.new
    unit.parse('')
    unit.parse('* separate list')
    unit.parse('** nested bullet point 1')
    unit.parse('** nested bullet point 2')
    unit.parse('* top-level bullet point')

    list = unit.content[0]
    assert_equal SemanticText::BulletedList, list.class
    assert_equal 3, list.content.size

    bullet_1 = list.content[0]
    assert_equal SemanticText::Bullet, bullet_1.class
    assert_equal SemanticText::Span, bullet_1.content[0].class
    assert_equal 'separate list', bullet_1.content[0].text

    nested = list.content[1]
    assert_equal SemanticText::BulletedList, nested.class
    assert_equal 2, nested.content.size

    bullet_2_1 = nested.content[0]
    assert_equal SemanticText::Bullet, bullet_2_1.class
    assert_equal SemanticText::Span, bullet_2_1.content[0].class
    assert_equal "nested bullet point 1", bullet_2_1.content[0].text

    bullet_2_2 = nested.content[1]
    assert_equal SemanticText::Bullet, bullet_2_2.class
    assert_equal SemanticText::Span, bullet_2_2.content[0].class
    assert_equal "nested bullet point 2", bullet_2_2.content[0].text

    bullet_3 = list.content[2]
    assert_equal SemanticText::Bullet, bullet_3.class
    assert_equal SemanticText::Span, bullet_3.content[0].class
    assert_equal "top-level bullet point", bullet_3.content[0].text
  end

  # an empty line between bullets starts a second, separate list
  def test_bulleted_list_parsing_into_two_separate_lists
    unit = SemanticText::Parser.new
    unit.parse('')
    unit.parse('* first bullet in first list')
    unit.parse('')
    unit.parse('* second bullet in second list')

    assert_equal SemanticText::BulletedList, unit.content[0].class
    assert_equal SemanticText::BulletedList, unit.content[1].class

    assert_not_same(unit.content[0], unit.content[1])
  end

end
@@ -0,0 +1,28 @@
1
+ title:test title
2
+ createdAt:5 November 2005
3
+ keywords: buzz, fuzz, muzz
4
+
5
+ !First Big Heading
6
+
7
+ This is another
8
+ paragraph.
9
+
10
+ This paragraph tests escaping < > &
11
+
12
+ Theis is the third paragraph.
13
+
14
+ Hey dude, check out my website:
15
+ http://www.example.com Cool innit?
16
+
17
+ !Second Big Section < > &
18
+ * point 1
19
+ ** subpoint 1.1
20
+ ** subpoint 1.2
21
+ * point 2
22
+ * < > &
23
+ ** subpoint 2.1
24
+ ** subpoint 2.2
25
+
26
+ This is another paragraph. This is a [c2:RecentChanges] tag.
27
+
28
+ http://www.example.com/foo?a=b&c=d
@@ -0,0 +1,15 @@
1
+
2
+ <h1>First Big Heading</h1>
3
+ <p> This is another paragraph.</p>
4
+ <p> This paragraph tests escaping &lt; &gt; &amp;</p>
5
+ <p> Theis is the third paragraph.</p>
6
+ <p> Hey dude, check out my website: <a href="http://www.example.com">http://www.example.com</a> Cool innit?</p>
7
+ <h1>Second Big Section &lt; &gt; &amp;</h1>
8
+ <ul><li> point 1</li>
9
+ <ul><li> subpoint 1.1</li><li> subpoint 1.2</li>
10
+ </ul><li> point 2</li><li> &lt; &gt; &amp;</li>
11
+ <ul><li> subpoint 2.1</li><li> subpoint 2.2</li>
12
+ </ul>
13
+ </ul>
14
+ <p> This is another paragraph. This is a [c2:RecentChanges] tag.</p>
15
+ <p><a href="http://www.example.com/foo?a=b&c=d">http://www.example.com/foo?a=b&amp;c=d</a></p>
@@ -0,0 +1,10 @@
1
+ title:test title
2
+ createdAt:5 November 2005
3
+ keywords: buzz, fuzz, muzz
4
+
5
+ !First Big Heading
6
+
7
+ This is another
8
+ paragraph.
9
+
10
+ Theis is the second paragraph.
metadata ADDED
@@ -0,0 +1,92 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: semantictext
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Dafydd Rees
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-11-29 00:00:00 +00:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Semantic Text is a system for marking up plain text documents with domain-specific tags.
17
+ email: os@greenbarsoft.co.uk
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - CHANGELOG
24
+ - COPYING
25
+ - README.rdoc
26
+ - TODO.rdoc
27
+ files:
28
+ - lib/semantictext/bullet.rb
29
+ - lib/semantictext/bulleted_list_parser.rb
30
+ - lib/semantictext/bulletedlist.rb
31
+ - lib/semantictext/date_extractor.rb
32
+ - lib/semantictext/default_tag_factory.rb
33
+ - lib/semantictext/extraction_failed.rb
34
+ - lib/semantictext/heading.rb
35
+ - lib/semantictext/keyword_extractor.rb
36
+ - lib/semantictext/link.rb
37
+ - lib/semantictext/not_header_line.rb
38
+ - lib/semantictext/paragraph.rb
39
+ - lib/semantictext/parser.rb
40
+ - lib/semantictext/rich_text_parser.rb
41
+ - lib/semantictext/span.rb
42
+ - lib/semantictext/tag.rb
43
+ - lib/semantictext.rb
44
+ - lib/string.rb
45
+ - test/bullet_test.rb
46
+ - test/bulleted_list_parser_test.rb
47
+ - test/dateextractor_test.rb
48
+ - test/export_test.rb
49
+ - test/keywordextractor_test.rb
50
+ - test/parser_test.rb
51
+ - testfiles/complex.art
52
+ - testfiles/regression-exportsample.txt
53
+ - testfiles/simple.art
54
+ - CHANGELOG
55
+ - COPYING
56
+ - README.rdoc
57
+ - TODO.rdoc
58
+ has_rdoc: true
59
+ homepage: http://www.greenbarsoft.co.uk/software/semantictext
60
+ licenses: []
61
+
62
+ post_install_message:
63
+ rdoc_options: []
64
+
65
+ require_paths:
66
+ - lib
67
+ required_ruby_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: "0"
72
+ version:
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: "0"
78
+ version:
79
+ requirements: []
80
+
81
+ rubyforge_project:
82
+ rubygems_version: 1.3.5
83
+ signing_key:
84
+ specification_version: 3
85
+ summary: Domain-Specific text markup parser
86
+ test_files:
87
+ - ./test/bullet_test.rb
88
+ - ./test/bulleted_list_parser_test.rb
89
+ - ./test/dateextractor_test.rb
90
+ - ./test/export_test.rb
91
+ - ./test/keywordextractor_test.rb
92
+ - ./test/parser_test.rb