semantictext 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +11 -0
- data/README.rdoc +7 -8
- data/TODO.rdoc +6 -2
- data/doc/demo.script +13 -0
- data/lib/semantictext/default_tag_factory.rb +1 -1
- data/lib/semantictext/document.rb +1 -2
- data/lib/semantictext/ftptag.rb +1 -0
- data/lib/semantictext/heading.rb +9 -3
- data/lib/semantictext/httptag.rb +1 -1
- data/lib/semantictext/mailtotag.rb +1 -0
- data/lib/semantictext/rich_text_parser.rb +13 -7
- data/lib/semantictext/tag.rb +3 -3
- data/test/{bullet_test.rb → test_bullet.rb} +2 -2
- data/test/{bulleted_list_parser_test.rb → test_bulleted_list_parser.rb} +0 -0
- data/test/{dateextractor_test.rb → test_dateextractor.rb} +0 -0
- data/test/{default_tag_factory_test.rb → test_default_tag_factory.rb} +0 -0
- data/test/{parser_test.rb → test_document.rb} +33 -8
- data/test/{export_test.rb → test_export.rb} +2 -0
- data/test/{keywordextractor_test.rb → test_keywordextractor.rb} +0 -0
- data/testfiles/complex.art +7 -0
- data/testfiles/regression-exportsample.txt +5 -1
- metadata +19 -19
- data/CHANGELOG +0 -2
- data/lib/semantictext/link.rb +0 -9
data/CHANGELOG.rdoc
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
== CHANGELOG
|
2
|
+
|
3
|
+
=== 0.2.1
|
4
|
+
2009-12-05: added support for multi-level headings e.g. "!!second level heading"
|
5
|
+
|
6
|
+
=== 0.2.0
|
7
|
+
2009-12-03: replaced ST::Link with ST::HttpTag, ST::MailToTag and ST::FtpTag
|
8
|
+
2009-12-02: renamed SemanticText::Parser to SemanticText::Document
|
9
|
+
2009-12-02: added support for arbitrary tags
|
10
|
+
=== 0.0.1
|
11
|
+
|
data/README.rdoc
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
-
|
1
|
+
== Semantic Text
|
2
2
|
Semantic Text is a Domain-Specific text markup parser.
|
3
3
|
It takes a file or sequence of lines and returns an object model of the document,
|
4
4
|
including document metadata (e.g. doc creation time and title) and a tree of
|
5
5
|
interconnected objects describing the document structure.
|
6
6
|
|
7
|
-
|
7
|
+
=== How to use it
|
8
8
|
* You need to have installed gemcutter.org into your gem locations.
|
9
9
|
* Install with:
|
10
10
|
gem install semantictext
|
@@ -27,8 +27,9 @@ interconnected objects describing the document structure.
|
|
27
27
|
|
28
28
|
* *rdoc* http://www.greenbarsoft.co.uk/software/semantictext/rdoc/
|
29
29
|
* *source* http://github.com/dafydd/semantictext
|
30
|
+
* To build me, set an environment variable called *SANDBOX* to the directory above your semantictext directory. The tests need this to access test data.
|
30
31
|
|
31
|
-
|
32
|
+
=== Semantic Markup
|
32
33
|
Semantic text supports:
|
33
34
|
* document metadata
|
34
35
|
* section headers
|
@@ -36,15 +37,13 @@ Semantic text supports:
|
|
36
37
|
* paragraphs that contain markup tags
|
37
38
|
* inline hyperlinks for http: mailto: and ftp:
|
38
39
|
* markup tags within bullet points
|
40
|
+
* custom markup tags e.g. postal code, youtube video embed, ... whatever you define in a subclass of SemanticText::DefaultTagFactory
|
39
41
|
|
40
|
-
We intend to support these features in future:
|
41
|
-
* custom markup tags e.g. postal code, youtube video embed, ...
|
42
42
|
|
43
|
-
|
44
|
-
= Compatibility
|
43
|
+
=== Compatibility
|
45
44
|
This project is being developed on OS X. Automated testing for Linux will be included in future releases.
|
46
45
|
|
47
|
-
|
46
|
+
=== Licence
|
48
47
|
This is open source software and comes with no warranty. See COPYING for details.
|
49
48
|
|
50
49
|
http://www.greenbarsoft.co.uk
|
data/TODO.rdoc
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
==to do
|
2
|
-
* support urls as a special structure tag
|
3
|
-
* tighten up error test cases on ftptag, httptag and mailtotag
|
4
2
|
* improve testing by mocking out tag factory used in tests - consider how/whether to do this
|
3
|
+
* tighten up error test cases on ftptag, httptag and mailtotag
|
4
|
+
* add verbatim code sections between {{{ and }}} brackets at start of line
|
5
5
|
* support wikinames as a special custom tag
|
6
6
|
* fix path to testfiles so test can run on gem - tests only run on source at the moment
|
7
7
|
|
@@ -12,6 +12,10 @@
|
|
12
12
|
* pull out parsers for different parts and use the state pattern
|
13
13
|
|
14
14
|
==done
|
15
|
+
* added basic rcov test coverage task to Rakefile
|
16
|
+
* find a way to generate demo.txt file from rake
|
17
|
+
* make demo.txt be up-to-date as dependency of rdoc
|
18
|
+
* replace SemanticText::Link with SemanticText::HTTPTag, SemanticText::MailToTag and SemanticText::FTPTag
|
15
19
|
* support custom structure tags
|
16
20
|
* escape HTML < > and & on headings
|
17
21
|
* build gem from rakefile
|
data/doc/demo.script
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'semantictext'
|
2
|
+
|
3
|
+
p = SemanticText::Document.new
|
4
|
+
p.parse 'title: my happy document'
|
5
|
+
p.parse ''
|
6
|
+
p.parse "!I'm a title"
|
7
|
+
p.parse "I'm a paragraph..."
|
8
|
+
p.parse "... yep, I'm still that paragraph."
|
9
|
+
p.parse ''
|
10
|
+
p.parse "* I'm a bullet point"
|
11
|
+
p.parse "* I'm another bullet point"
|
12
|
+
|
13
|
+
puts p.export_html # this is how to export as a HTML fragment
|
@@ -4,7 +4,6 @@ require 'semantictext/keyword_extractor'
|
|
4
4
|
require 'semantictext/not_header_line'
|
5
5
|
require 'semantictext/paragraph'
|
6
6
|
require 'semantictext/span'
|
7
|
-
require 'semantictext/link'
|
8
7
|
require 'semantictext/tag'
|
9
8
|
require 'string'
|
10
9
|
require 'semantictext/bulletedlist'
|
@@ -108,7 +107,7 @@ module SemanticText
|
|
108
107
|
return
|
109
108
|
end
|
110
109
|
if (line.begins_with('!'))
|
111
|
-
|
110
|
+
@content << Heading.parse(line)
|
112
111
|
else
|
113
112
|
if (line.begins_with('*'))
|
114
113
|
if @bulleted_list_parser.nil?
|
data/lib/semantictext/ftptag.rb
CHANGED
data/lib/semantictext/heading.rb
CHANGED
@@ -1,14 +1,20 @@
|
|
1
1
|
module SemanticText
|
2
2
|
class Heading
|
3
|
-
attr_reader :text
|
3
|
+
attr_reader :text, :depth
|
4
4
|
|
5
|
-
|
5
|
+
def self.parse(line)
|
6
|
+
line =~ /(!+)(.*)/
|
7
|
+
Heading.new($2,$1.size)
|
8
|
+
end
|
9
|
+
|
10
|
+
def initialize(aTitle, depth)
|
6
11
|
@text = aTitle
|
12
|
+
@depth = depth
|
7
13
|
end
|
8
14
|
|
9
15
|
#export as html
|
10
16
|
def export_html
|
11
|
-
"\n<h1>#{ CGI.escapeHTML(@text)}</h1>"
|
17
|
+
"\n<h#{@depth}>#{ CGI.escapeHTML(@text)}</h#{@depth}>"
|
12
18
|
end
|
13
19
|
|
14
20
|
end
|
data/lib/semantictext/httptag.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
require 'semantictext/span'
|
2
|
-
require 'semantictext/link'
|
3
2
|
require 'semantictext/tag'
|
4
3
|
|
5
4
|
require 'string'
|
@@ -15,7 +14,15 @@ module SemanticText
|
|
15
14
|
@tag_factory = tag_factory
|
16
15
|
end
|
17
16
|
|
18
|
-
|
17
|
+
private
|
18
|
+
|
19
|
+
def create_tag(regex, source)
|
20
|
+
source =~ regex
|
21
|
+
tag_name = $1
|
22
|
+
tag_value = $2
|
23
|
+
@tag_factory.create_tag(tag_name, tag_value)
|
24
|
+
end
|
25
|
+
|
19
26
|
def parse_text_for_urls(text, enclosing_element)
|
20
27
|
link_next = false
|
21
28
|
ignore_next_section = false
|
@@ -29,7 +36,7 @@ module SemanticText
|
|
29
36
|
ignore_next_section = false
|
30
37
|
else
|
31
38
|
if (link_next)
|
32
|
-
enclosing_element <<
|
39
|
+
enclosing_element << create_tag(/([^:]+):([^\]]*)/, section)
|
33
40
|
ignore_next_section = true
|
34
41
|
else
|
35
42
|
enclosing_element << Span.new(section)
|
@@ -40,15 +47,14 @@ module SemanticText
|
|
40
47
|
end
|
41
48
|
|
42
49
|
public
|
50
|
+
|
51
|
+
# I parse a line of text, pushing the elements into enclosing_element as I find them.
|
43
52
|
def parse(line, enclosing_element)
|
44
53
|
sections = line.split /(\[[^:]+:[^\]]+\])/
|
45
54
|
tag_next = false
|
46
55
|
sections.each do |section|
|
47
56
|
if (tag_next)
|
48
|
-
|
49
|
-
tag_name = $1
|
50
|
-
tag_value = $2
|
51
|
-
enclosing_element << @tag_factory.create_tag(tag_name, tag_value)
|
57
|
+
enclosing_element << create_tag(/\[([^:]+):([^\]]*)\]/, section)
|
52
58
|
else
|
53
59
|
parse_text_for_urls(section, enclosing_element)
|
54
60
|
end
|
data/lib/semantictext/tag.rb
CHANGED
@@ -15,9 +15,9 @@ class BulletTest < Test::Unit::TestCase
|
|
15
15
|
unit = SemanticText::Bullet.new(test_string, 1, SemanticText::RichTextParser.new(SemanticText::DefaultTagFactory.new))
|
16
16
|
assert_equal(1, unit.depth)
|
17
17
|
assert_element SemanticText::Span, "beginning ", unit.content[0]
|
18
|
-
assert_element SemanticText::
|
18
|
+
assert_element SemanticText::HTTPTag, "//www.example.com", unit.content[1]
|
19
19
|
assert_element SemanticText::Span, " moretext ", unit.content[2]
|
20
|
-
assert_element SemanticText::
|
20
|
+
assert_element SemanticText::HTTPTag, "//www.dafydd.net", unit.content[3]
|
21
21
|
assert_element SemanticText::Span, " ending text", unit.content[4]
|
22
22
|
end
|
23
23
|
|
File without changes
|
File without changes
|
File without changes
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'test/unit'
|
2
2
|
require 'semantictext/document'
|
3
3
|
|
4
|
-
class ParserTest < Test::Unit::TestCase
|
4
|
+
class TestDocument < Test::Unit::TestCase
|
5
5
|
|
6
6
|
def assert_element(element_class, text, actual)
|
7
7
|
assert_equal element_class, actual.class
|
@@ -63,9 +63,9 @@ EOF
|
|
63
63
|
test_lines.each {|line| unit.parse(line)}
|
64
64
|
result = unit.content[0]
|
65
65
|
assert_equal SemanticText::Paragraph, result.class
|
66
|
-
assert_element SemanticText::
|
66
|
+
assert_element SemanticText::HTTPTag, "//www.dafydd.net/foogoo?blah", result.content[0]
|
67
67
|
assert_element SemanticText::Span, " see? ", result.content[1]
|
68
|
-
assert_element SemanticText::
|
68
|
+
assert_element SemanticText::HTTPTag, "//www.example.com", result.content[2]
|
69
69
|
assert_element SemanticText::Span, "I wonder if it worked!", result.content[3]
|
70
70
|
assert_equal 4, result.content.size
|
71
71
|
end
|
@@ -76,20 +76,20 @@ EOF
|
|
76
76
|
|
77
77
|
Embedded link http://www.dafydd.net/foogoo?blah see?
|
78
78
|
I wonder if it worked!
|
79
|
-
a mailto:foogoo b ftp://
|
79
|
+
a mailto:foogoo b ftp://host/path c
|
80
80
|
EOF
|
81
81
|
test_lines.each {|line| unit.parse(line)}
|
82
82
|
|
83
83
|
result = unit.content[0]
|
84
84
|
assert_equal SemanticText::Paragraph, result.class
|
85
85
|
assert_element SemanticText::Span, "Embedded link ", result.content[0]
|
86
|
-
assert_element SemanticText::
|
86
|
+
assert_element SemanticText::HTTPTag, "//www.dafydd.net/foogoo?blah", result.content[1]
|
87
87
|
assert_element SemanticText::Span, " see?", result.content[2]
|
88
88
|
assert_element SemanticText::Span, "I wonder if it worked!", result.content[3]
|
89
89
|
assert_element SemanticText::Span, 'a ', result.content[4]
|
90
|
-
assert_element SemanticText::
|
90
|
+
assert_element SemanticText::MailToTag, 'foogoo', result.content[5]
|
91
91
|
assert_element SemanticText::Span, ' b ', result.content[6]
|
92
|
-
assert_element SemanticText::
|
92
|
+
assert_element SemanticText::FTPTag, '//host/path', result.content[7]
|
93
93
|
assert_element SemanticText::Span, ' c', result.content[8]
|
94
94
|
assert_equal 9, result.content.size
|
95
95
|
end
|
@@ -176,6 +176,31 @@ EOF
|
|
176
176
|
assert_equal 9, result.content.size
|
177
177
|
end
|
178
178
|
|
179
|
+
def test_heading_parsing
|
180
|
+
unit = SemanticText::Document.new
|
181
|
+
unit.parse ''
|
182
|
+
unit.parse '!1st level heading'
|
183
|
+
unit.parse '!!2nd level heading'
|
184
|
+
unit.parse '!!!3rd level heading'
|
185
|
+
|
186
|
+
first_heading = unit.content[0]
|
187
|
+
second_heading = unit.content[1]
|
188
|
+
third_heading = unit.content[2]
|
189
|
+
|
190
|
+
assert_equal "1st level heading", first_heading.text
|
191
|
+
assert_equal SemanticText::Heading, first_heading.class
|
192
|
+
assert_equal 1, first_heading.depth
|
193
|
+
|
194
|
+
|
195
|
+
assert_equal "2nd level heading", second_heading.text
|
196
|
+
assert_equal SemanticText::Heading, second_heading.class
|
197
|
+
assert_equal 2, second_heading.depth
|
198
|
+
|
199
|
+
assert_equal "3rd level heading", third_heading.text
|
200
|
+
assert_equal SemanticText::Heading, third_heading.class
|
201
|
+
assert_equal 3, third_heading.depth
|
202
|
+
end
|
203
|
+
|
179
204
|
def test_paragraphs_headings_and_bullet_points
|
180
205
|
unit = SemanticText::Document.new
|
181
206
|
unit.parse('')
|
@@ -231,7 +256,7 @@ EOF
|
|
231
256
|
assert_equal SemanticText::BulletedList, actual_list.class
|
232
257
|
|
233
258
|
assert_element SemanticText::Span, 'with url ', first_bullet.content[0]
|
234
|
-
assert_element SemanticText::
|
259
|
+
assert_element SemanticText::HTTPTag, '//www.example.com', first_bullet.content[1]
|
235
260
|
assert_element SemanticText::Span, ' see?', first_bullet.content[2]
|
236
261
|
|
237
262
|
assert_element SemanticText::Span, 'with tag ', second_bullet.content[0]
|
File without changes
|
data/testfiles/complex.art
CHANGED
@@ -11,6 +11,9 @@ This paragraph tests escaping < > &
|
|
11
11
|
|
12
12
|
Theis is the third paragraph.
|
13
13
|
|
14
|
+
!!second-level heading
|
15
|
+
!!!third-level heading
|
16
|
+
|
14
17
|
Hey dude, check out my website:
|
15
18
|
http://www.example.com Cool innit?
|
16
19
|
|
@@ -26,3 +29,7 @@ http://www.example.com Cool innit?
|
|
26
29
|
This is another paragraph. This is a [http://www.example.com] tag.
|
27
30
|
|
28
31
|
http://www.example.com/foo?a=b&c=d
|
32
|
+
|
33
|
+
ftp://host/path
|
34
|
+
|
35
|
+
mailto:fred@example.com
|
@@ -3,6 +3,8 @@
|
|
3
3
|
<p> This is another paragraph.</p>
|
4
4
|
<p> This paragraph tests escaping < > &</p>
|
5
5
|
<p> Theis is the third paragraph.</p>
|
6
|
+
<h2>second-level heading</h2>
|
7
|
+
<h3>third-level heading</h3>
|
6
8
|
<p> Hey dude, check out my website: <a href="http://www.example.com">http://www.example.com</a> Cool innit?</p>
|
7
9
|
<h1>Second Big Section < > &</h1>
|
8
10
|
<ul><li> point 1</li>
|
@@ -11,5 +13,7 @@
|
|
11
13
|
<ul><li> subpoint 2.1</li><li> subpoint 2.2</li>
|
12
14
|
</ul>
|
13
15
|
</ul>
|
14
|
-
<p> This is another paragraph. This is a
|
16
|
+
<p> This is another paragraph. This is a <a href="http://www.example.com">http://www.example.com</a> tag.</p>
|
15
17
|
<p><a href="http://www.example.com/foo?a=b&c=d">http://www.example.com/foo?a=b&c=d</a></p>
|
18
|
+
<p><a href="ftp://host/path">ftp://host/path</a></p>
|
19
|
+
<p><a href="mailto:fred@example.com">mailto:fred@example.com</a></p>
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: semantictext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dafydd Rees
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-12-
|
12
|
+
date: 2009-12-05 00:00:00 +00:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
@@ -20,11 +20,12 @@ executables: []
|
|
20
20
|
extensions: []
|
21
21
|
|
22
22
|
extra_rdoc_files:
|
23
|
-
- CHANGELOG
|
23
|
+
- CHANGELOG.rdoc
|
24
24
|
- COPYING
|
25
25
|
- README.rdoc
|
26
26
|
- TODO.rdoc
|
27
27
|
files:
|
28
|
+
- doc/demo.script
|
28
29
|
- lib/semantictext/bullet.rb
|
29
30
|
- lib/semantictext/bulleted_list_parser.rb
|
30
31
|
- lib/semantictext/bulletedlist.rb
|
@@ -36,7 +37,6 @@ files:
|
|
36
37
|
- lib/semantictext/heading.rb
|
37
38
|
- lib/semantictext/httptag.rb
|
38
39
|
- lib/semantictext/keyword_extractor.rb
|
39
|
-
- lib/semantictext/link.rb
|
40
40
|
- lib/semantictext/mailtotag.rb
|
41
41
|
- lib/semantictext/not_header_line.rb
|
42
42
|
- lib/semantictext/paragraph.rb
|
@@ -47,17 +47,17 @@ files:
|
|
47
47
|
- lib/semantictext/unknown_tag.rb
|
48
48
|
- lib/semantictext.rb
|
49
49
|
- lib/string.rb
|
50
|
-
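- test/bullet_test.rb
|
51
|
-
- test/bulleted_list_parser_test.rb
|
52
|
-
- test/dateextractor_test.rb
|
53
|
-
- test/default_tag_factory_test.rb
|
54
|
-
- test/export_test.rb
|
55
|
-
- test/keywordextractor_test.rb
|
56
|
-
- test/parser_test.rb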
|
50
|
+
- test/test_bullet.rb
|
51
|
+
- test/test_bulleted_list_parser.rb
|
52
|
+
- test/test_dateextractor.rb
|
53
|
+
- test/test_default_tag_factory.rb
|
54
|
+
- test/test_document.rb
|
55
|
+
- test/test_export.rb
|
56
|
+
- test/test_keywordextractor.rb
|
57
57
|
- testfiles/complex.art
|
58
58
|
- testfiles/regression-exportsample.txt
|
59
59
|
- testfiles/simple.art
|
60
|
-
- CHANGELOG
|
60
|
+
- CHANGELOG.rdoc
|
61
61
|
- COPYING
|
62
62
|
- README.rdoc
|
63
63
|
- TODO.rdoc
|
@@ -90,10 +90,10 @@ signing_key:
|
|
90
90
|
specification_version: 3
|
91
91
|
summary: Domain-Specific text markup parser
|
92
92
|
test_files:
|
93
|
-
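- ./test/bullet_test.rb
|
94
|
-
- ./test/bulleted_list_parser_test.rb
|
95
|
-
- ./test/dateextractor_test.rb
|
96
|
-
- ./test/default_tag_factory_test.rb
|
97
|
-
- ./test/export_test.rb
|
98
|
-
- ./test/keywordextractor_test.rb
|
99
|
-
- ./test/parser_test.rb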
|
93
|
+
- ./test/test_bullet.rb
|
94
|
+
- ./test/test_bulleted_list_parser.rb
|
95
|
+
- ./test/test_dateextractor.rb
|
96
|
+
- ./test/test_default_tag_factory.rb
|
97
|
+
- ./test/test_document.rb
|
98
|
+
- ./test/test_export.rb
|
99
|
+
- ./test/test_keywordextractor.rb
|
data/CHANGELOG
DELETED